├── Apache Airflow ├── Welcome_DAG.py ├── dag_preprocessing.py ├── population_dag.py └── wiki_dag.py ├── Classification Prediction Scikit Learn ├── Bank Customer Churn Prediction.ipynb ├── Bank Customer Churn.ipynb ├── Bank Loan Approval Exploratory Data Analysis.ipynb ├── Bank Turnover Prediction.ipynb ├── Customer Churn Prediction.ipynb ├── Customer Credit Risk Prediction.ipynb ├── Customer Personality Prediction to Boost Marketing Campaign.ipynb ├── Customer Satisfaction in Airline Prediction.ipynb ├── Fraudulent Claim on Cars Physical Damage Prediction.ipynb ├── Loan Default Prediction.ipynb ├── Loan Defaults Prediction.ipynb ├── Loan Prediction Analytics Vidhya Competition.ipynb ├── Loan Prediction Analytics Vidhya.ipynb ├── Loan Prediction Based on Customer Behavior.ipynb ├── Loan Prediction.ipynb ├── Predict CLTV of a customer.ipynb ├── Predict Customer Clicked Ads Classification.ipynb ├── Predict if a client will subscribe to a term deposit.ipynb └── Travel Insurance Prediction.ipynb ├── End to End Data Science Project ├── End to End Brazilian E-Commerce Analysis.ipynb └── End to End Customer Churn and Sales Analysis.ipynb ├── Flourish └── README.md ├── KNIME Project ├── Knime Simple Data Preprocessing.JPG └── README.md ├── Langchain LLM ├── LangChain_Chroma.ipynb ├── Langchain_Analyze_CSV.ipynb ├── Langchain_Analyze_PDF.ipynb ├── README.md ├── gemini_web_langchain.py ├── langchain_complete │ ├── file.csv │ ├── file.docx │ ├── file.pdf │ ├── file.pptx │ ├── file.xlsx │ └── langchain_streamlit.py └── langchain_youtube.py ├── LlamaIndex └── llamastreamlit.py ├── MySQL ├── Data Science Salary Query.sql ├── INNER JOIN COMBINATION.sql ├── README.md ├── SQL JOIN.sql ├── STUDENTS PERFORMANCE.sql ├── SUPERSTORE DATA ANALYSIS.sql ├── Sample - Superstore - Wanda.xlsx - Orders.csv ├── Students_Performance_mv.csv ├── VIRTUAL INTERNSHIP QUERIES.sql ├── ds_salaries.csv ├── exam score analysis.sql ├── excercise 1.sql ├── sakila-dvd-rental.sql └── yellow_tlc_apr2022_1k.csv ├── Natural Language Processing ├── Anies_Sentiment_Analysis.ipynb ├── RUU_DPR_2020_2024.ipynb ├── Sentiment_Analisis_Prabowo.ipynb └── emotion_streamlit.py ├── Power BI ├── pbi1.JPG └── pbi2.JPG ├── PySpark ├── Insurance_Claim_Pyspark.ipynb └── PySpark_Data_Preprocessing.ipynb ├── R Language ├── calculate.R ├── coba.R └── portfolio.R ├── README.md ├── Regression Prediction Scikit Learn ├── Ford Car Price Prediction.ipynb ├── Honda Price Prediction.ipynb ├── House Price Prediction for Kaggle Competition.ipynb ├── House Rent Price Prediction.ipynb ├── Media Campaign Cost Prediction.ipynb ├── Medical Insurance Cost Prediction.ipynb ├── Melbourne Housing Price Prediction.ipynb ├── NY Rental Pricing Prediction.ipynb ├── Rain Prediction in Australian Coursera.ipynb ├── Salary Prediction.ipynb ├── Salary prediction based on country and race.ipynb ├── Software Industry Salary Prediction.ipynb ├── Sport Car Price Prediction.ipynb ├── USA Real Estate Price Prediction.ipynb └── Used Vehicle Price Prediction.ipynb ├── Snowflake Cloud ├── README.md ├── Snowflake_Python_Connector.ipynb ├── Snowflake_Snowpark_Session.ipynb └── Snowpark_Data_Pipeline_and_Transformation_Covid.ipynb ├── Streamlit-Web-Application-main ├── README.md ├── __pycache__ │ ├── flask.cpython-311.pyc │ └── pandasai.cpython-311.pyc ├── auto_sentiment_analysis_twitter.py ├── chat_with_your_csv.py ├── cheatgpt.py ├── compare.py ├── complete_pack.py ├── diagnosis.py ├── ecommerce_clustering_llm.py ├── fraud_analysis_llm.py ├── indonesia-bert-sentiment-classification │ └── 
config.json ├── llmpandas.py ├── pdf_comparer.py ├── pdf_document_analysis.py ├── table_scraper_analysis.py └── web_scrape.py ├── Tableau └── Dashboard 1.png ├── Tensorflow ├── ANTAM_Stock_Price_Prediction.ipynb ├── Classify_Mineral_Stone.ipynb ├── GOTO_Stock_Price.ipynb ├── Insurance_Claim_Fraud_with_GAN.ipynb └── Insurance_Claim_Tensorflow.ipynb └── site └── en └── gemini-api └── docs └── model-tuning └── python.ipynb /Apache Airflow/Welcome_DAG.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow import DAG 4 | from airflow.operators.empty import EmptyOperator 5 | 6 | my_dag = DAG( 7 | dag_id="my_dag_name", 8 | start_date=datetime.datetime(2021, 1, 1), 9 | schedule="@daily", 10 | ) 11 | EmptyOperator(task_id="task", dag=my_dag) -------------------------------------------------------------------------------- /Apache Airflow/dag_preprocessing.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.providers.mysql.hooks.mysql import MySqlHook 3 | from datetime import datetime 4 | from airflow.operators.python import PythonOperator 5 | 6 | # Replace with your actual connection ID 7 | connection_id = 'mysql' 8 | 9 | def test_mysql_connection(): 10 | try: 11 | # Get the connection from Airflow 12 | mysql_hook = MySqlHook(mysql_conn_id=connection_id) 13 | 14 | # Attempt a simple connection test (e.g., ping the server) 15 | with mysql_hook.get_conn() as conn: 16 | cursor = conn.cursor() 17 | cursor.execute("SELECT * FROM marketing.customer;") 18 | result = cursor.fetchone() 19 | 20 | if result: 21 | print("Connection to MySQL successful!") 22 | else: 23 | print("Connection test failed!") 24 | 25 | except Exception as e: 26 | print(f"Error connecting to MySQL: {e}") 27 | 28 | with DAG(dag_id='test_mysql_connection', 29 | start_date=datetime(2024, 4, 15), 30 | schedule_interval=None) as dag: 31 | 32 | test_connection_task = PythonOperator( 33 | task_id='test_connection', 34 | python_callable=test_mysql_connection 35 | ) 36 | -------------------------------------------------------------------------------- /Apache Airflow/population_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.utils.dates import days_ago 3 | from airflow.operators.python_operator import PythonOperator 4 | from bs4 import BeautifulSoup # For web scraping 5 | import requests 6 | 7 | # Define default arguments 8 | default_args = { 9 | 'owner': 'airflow', 10 | 'start_date': days_ago(1), # Start yesterday 11 | 'schedule_interval': '@daily', # Run daily 12 | } 13 | 14 | 15 | def scrape_worldometer(ti): # Inject the TaskInstance object 16 | """ 17 | Scrapes Worldometer website for population data and stores in XCom. 
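A downstream task in this DAG could read these values back from XCom with xcom_pull; the sketch below is illustrative only (the print_population_stats callable and any task wired to it are assumptions, not part of this file), though the task_id and key match the ones used here:

        def print_population_stats(ti):
            # Pull the dict pushed by the 'scrape_worldometer' task under key 'worldometer_data'
            data = ti.xcom_pull(task_ids='scrape_worldometer', key='worldometer_data')
            print(f"Births today: {data['births_today']}, deaths today: {data['deaths_today']}")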
18 | """ 19 | url = 'https://www.worldometers.info/world-population/' 20 | response = requests.get(url) 21 | soup = BeautifulSoup(response.content, 'html.parser') 22 | 23 | # Target elements using updated selectors 24 | births_today = soup.find('span', class_='rts-counter', rel='births_today').text.strip() 25 | deaths_today = soup.find('span', class_='rts-counter', rel='dth1s_today').text.strip() 26 | 27 | # Store data in XCom for retrieval by downstream tasks 28 | ti.xcom_push( 29 | key='worldometer_data', 30 | value={ 31 | 'births_today': births_today, 32 | 'deaths_today': deaths_today 33 | } 34 | ) 35 | 36 | # Define the DAG 37 | with DAG( 38 | dag_id='worldometer_scraper', 39 | default_args=default_args, 40 | ) as dag: 41 | 42 | # Scrape data task 43 | scrape_task = PythonOperator( 44 | task_id='scrape_worldometer', 45 | python_callable=scrape_worldometer, # Pass the function with TaskInstance injection 46 | ) 47 | -------------------------------------------------------------------------------- /Apache Airflow/wiki_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.utils.dates import days_ago 3 | from airflow.operators.python_operator import PythonOperator 4 | from bs4 import BeautifulSoup 5 | import requests 6 | 7 | # Define default arguments 8 | default_args = { 9 | 'owner': 'airflow', 10 | 'start_date': days_ago(1), # Start yesterday 11 | 'schedule_interval': '@daily', # Run daily 12 | } 13 | 14 | 15 | def scrape_wiki_content(ti): 16 | """ 17 | Scrapes content from Albert Einstein's Wikipedia page and stores it in XCom. 18 | """ 19 | url = 'https://en.wikipedia.org/wiki/Albert_Einstein' 20 | response = requests.get(url) 21 | soup = BeautifulSoup(response.content, 'html.parser') 22 | 23 | # Target all paragraphs within the main content section (can be adjusted) 24 | content_elements = soup.find_all('p', class_=None) # Find all paragraphs without a class 25 | 26 | # Combine the text content of all paragraphs 27 | content_text = '\n'.join([p.get_text(strip=True) for p in content_elements]) 28 | 29 | # Store the content in XCom for retrieval by downstream tasks 30 | ti.xcom_push( 31 | key='einstein_wiki_content', 32 | value=content_text 33 | ) 34 | 35 | 36 | # Define the DAG 37 | with DAG( 38 | dag_id='wiki_einstein_scraper', 39 | default_args=default_args, 40 | ) as dag: 41 | 42 | # Scrape data task 43 | scrape_task = PythonOperator( 44 | task_id='scrape_wiki_content', 45 | python_callable=scrape_wiki_content, # Pass the function with TaskInstance injection 46 | ) 47 | -------------------------------------------------------------------------------- /Flourish/README.md: -------------------------------------------------------------------------------- 1 | # Link for Flourish Visualization 2 | 3 | 1. Loan Default Analysis : https://public.flourish.studio/story/2119154/ 4 | 2. 
Superstore Sales Analysis : https://public.flourish.studio/story/2117963/ 5 | -------------------------------------------------------------------------------- /KNIME Project/Knime Simple Data Preprocessing.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/KNIME Project/Knime Simple Data Preprocessing.JPG -------------------------------------------------------------------------------- /KNIME Project/README.md: -------------------------------------------------------------------------------- 1 | # KNIME Project 2 | 3 | 1. Simple Data Preprocessing explanation : https://www.linkedin.com/posts/michael-wiryaseputra_knime-datapreprocessing-datascience-activity-7184053754592129024-eqzs?utm_source=share&utm_medium=member_desktop 4 | -------------------------------------------------------------------------------- /Langchain LLM/README.md: -------------------------------------------------------------------------------- 1 | # Langchain LLM 2 | This repository is for all of my Langchain projects; modify this code if you want to use it in a real-time project 3 | 4 | ## 1. Langchain Analyze CSV 5 | This project analyzes a CSV file with the Langchain CSV Agent. The user can ask anything about the CSV dataset, and Langchain will query it and answer based on the question 6 | 7 | ## 2. Langchain Analyze PDF 8 | This project analyzes the content of a PDF file with Langchain and then answers based on the user's question 9 | 10 | ## 3. Langchain Analyze Youtube Video 11 | This project analyzes the content of a YouTube video with Langchain and then answers based on the user's question 12 | 13 | ## 4. Langchain Analyze Website 14 | This project analyzes the content of a website with Langchain and then answers based on the user's question 15 | -------------------------------------------------------------------------------- /Langchain LLM/gemini_web_langchain.py: -------------------------------------------------------------------------------- 1 | from langchain_google_genai import ChatGoogleGenerativeAI 2 | from langchain_google_genai import GoogleGenerativeAIEmbeddings 3 | from langchain_community.document_loaders import WebBaseLoader 4 | from langchain.chains import StuffDocumentsChain 5 | from langchain.chains.llm import LLMChain 6 | from langchain.prompts import PromptTemplate 7 | import google.generativeai as genai 8 | 9 | #genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc") 10 | 11 | #Initialize Model 12 | llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc") 13 | 14 | #Load the blog 15 | loader = WebBaseLoader("https://thenewstack.io/the-building-blocks-of-llms-vectors-tokens-and-embeddings/") 16 | docs = loader.load() 17 | 18 | #Define the Summarize Chain 19 | template = """Write a concise summary of the following: 20 | "{text}" 21 | CONCISE SUMMARY:""" 22 | 23 | prompt = PromptTemplate.from_template(template) 24 | 25 | llm_chain = LLMChain(llm=llm, prompt=prompt) 26 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text") 27 | 28 | #Invoke Chain 29 | response=stuff_chain.invoke(docs) 30 | print(response["output_text"]) -------------------------------------------------------------------------------- /Langchain LLM/langchain_complete/file.docx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Langchain LLM/langchain_complete/file.docx -------------------------------------------------------------------------------- /Langchain LLM/langchain_complete/file.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Langchain LLM/langchain_complete/file.pdf -------------------------------------------------------------------------------- /Langchain LLM/langchain_complete/file.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Langchain LLM/langchain_complete/file.pptx -------------------------------------------------------------------------------- /Langchain LLM/langchain_complete/file.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Langchain LLM/langchain_complete/file.xlsx -------------------------------------------------------------------------------- /Langchain LLM/langchain_complete/langchain_streamlit.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | # Define functions for each page 4 | def langchain_pdf(): 5 | st.title("Langchain PDF Text Analysis") 6 | from langchain_google_genai import ChatGoogleGenerativeAI 7 | from langchain_community.document_loaders import PyPDFLoader 8 | from langchain.chains import StuffDocumentsChain 9 | from langchain.chains.llm import LLMChain 10 | from langchain.prompts import PromptTemplate 11 | import asyncio 12 | import nest_asyncio 13 | nest_asyncio.apply() 14 | 15 | # Initialize Model 16 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM") 17 | 18 | # Input for PDF file 19 | uploaded_file = st.file_uploader("Upload PDF", type=["pdf"]) 20 | 21 | # Input for the question 22 | question = st.text_input("Enter your question:") 23 | 24 | if st.button("Analyze"): 25 | if uploaded_file is not None: 26 | # Save the uploaded PDF file with the name "file.pdf" 27 | with open("file.pdf", "wb") as f: 28 | f.write(uploaded_file.getvalue()) 29 | 30 | # Load the PDF file 31 | loader = PyPDFLoader("file.pdf") 32 | docs = loader.load_and_split() 33 | 34 | # Define the Summarize Chain 35 | template = question + """ Write a concise summary of the following: 36 | "{text}" 37 | CONCISE SUMMARY:""" 38 | 39 | prompt = PromptTemplate.from_template(template) 40 | 41 | llm_chain = LLMChain(llm=llm, prompt=prompt) 42 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text") 43 | 44 | # Invoke Chain 45 | response = stuff_chain.invoke(docs) 46 | summary = response["output_text"] 47 | 48 | # Display the summary 49 | st.header("Summary:") 50 | st.write(summary) 51 | else: 52 | st.error("Please upload a PDF file.") 53 | 54 | 55 | def langchain_doc(): 56 | st.title("Langchain Microsoft Word File Analysis") 57 | from langchain_google_genai import ChatGoogleGenerativeAI 58 | from langchain_community.document_loaders import Docx2txtLoader 59 | from langchain.chains import StuffDocumentsChain 60 | from langchain.chains.llm import LLMChain 61 | from langchain.prompts 
import PromptTemplate 62 | import asyncio 63 | import nest_asyncio 64 | nest_asyncio.apply() 65 | 66 | # Initialize Model 67 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM") 68 | 69 | # Input for PDF file 70 | uploaded_file = st.file_uploader("Upload PDF", type=["docx"]) 71 | 72 | # Input for the question 73 | question = st.text_input("Enter your question:") 74 | 75 | if st.button("Analyze"): 76 | if uploaded_file is not None: 77 | # Save the uploaded PDF file with the name "file.pdf" 78 | with open("file.docx", "wb") as f: 79 | f.write(uploaded_file.getvalue()) 80 | 81 | # Load the PDF file 82 | loader = Docx2txtLoader("file.docx") 83 | docs = loader.load_and_split() 84 | 85 | # Define the Summarize Chain 86 | template = question + """ Write a concise summary of the following: 87 | "{text}" 88 | CONCISE SUMMARY:""" 89 | 90 | prompt = PromptTemplate.from_template(template) 91 | 92 | llm_chain = LLMChain(llm=llm, prompt=prompt) 93 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text") 94 | 95 | # Invoke Chain 96 | response = stuff_chain.invoke(docs) 97 | summary = response["output_text"] 98 | 99 | # Display the summary 100 | st.header("Summary:") 101 | st.write(summary) 102 | else: 103 | st.error("Please upload a Micosoft Word file.") 104 | 105 | 106 | def langchain_excel(): 107 | st.title("Langchain Microsoft Excel File Analysis") 108 | from langchain_google_genai import ChatGoogleGenerativeAI 109 | from langchain_community.document_loaders import UnstructuredExcelLoader 110 | from langchain.chains import StuffDocumentsChain 111 | from langchain.chains.llm import LLMChain 112 | from langchain.prompts import PromptTemplate 113 | import asyncio 114 | import nest_asyncio 115 | nest_asyncio.apply() 116 | 117 | # Initialize Model 118 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM") 119 | 120 | # Input for PDF file 121 | uploaded_file = st.file_uploader("Upload PDF", type=["xlsx"]) 122 | 123 | # Input for the question 124 | question = st.text_input("Enter your question:") 125 | 126 | if st.button("Analyze"): 127 | if uploaded_file is not None: 128 | # Save the uploaded PDF file with the name "file.pdf" 129 | with open("file.xlsx", "wb") as f: 130 | f.write(uploaded_file.getvalue()) 131 | 132 | # Load the PDF file 133 | loader = UnstructuredExcelLoader("file.xlsx", mode="elements") 134 | docs = loader.load() 135 | 136 | # Define the Summarize Chain 137 | template = question + """ Write a concise summary of the following: 138 | "{text}" 139 | CONCISE SUMMARY:""" 140 | 141 | prompt = PromptTemplate.from_template(template) 142 | 143 | llm_chain = LLMChain(llm=llm, prompt=prompt) 144 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text") 145 | 146 | # Invoke Chain 147 | response = stuff_chain.invoke(docs) 148 | summary = response["output_text"] 149 | 150 | # Display the summary 151 | st.header("Summary:") 152 | st.write(summary) 153 | else: 154 | st.error("Please upload a Excel file.") 155 | 156 | def langchain_ppt(): 157 | st.title("Langchain Microsoft Power Point File Analysis") 158 | from langchain_google_genai import ChatGoogleGenerativeAI 159 | from langchain_community.document_loaders import UnstructuredPowerPointLoader 160 | from langchain.chains import StuffDocumentsChain 161 | from langchain.chains.llm import LLMChain 162 | from langchain.prompts import PromptTemplate 163 | import 
asyncio 164 | import nest_asyncio 165 | nest_asyncio.apply() 166 | 167 | # Initialize Model 168 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM") 169 | 170 | # Input for PDF file 171 | uploaded_file = st.file_uploader("Upload PDF", type=["pptx"]) 172 | 173 | # Input for the question 174 | question = st.text_input("Enter your question:") 175 | 176 | if st.button("Analyze"): 177 | if uploaded_file is not None: 178 | # Save the uploaded PDF file with the name "file.pdf" 179 | with open("file.pptx", "wb") as f: 180 | f.write(uploaded_file.getvalue()) 181 | 182 | # Load the PDF file 183 | loader = UnstructuredPowerPointLoader("file.pptx", mode="elements") 184 | docs = loader.load_and_split() 185 | 186 | # Define the Summarize Chain 187 | template = question + """ Write a concise summary of the following: 188 | "{text}" 189 | CONCISE SUMMARY:""" 190 | 191 | prompt = PromptTemplate.from_template(template) 192 | 193 | llm_chain = LLMChain(llm=llm, prompt=prompt) 194 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text") 195 | 196 | # Invoke Chain 197 | response = stuff_chain.invoke(docs) 198 | summary = response["output_text"] 199 | 200 | # Display the summary 201 | st.header("Summary:") 202 | st.write(summary) 203 | else: 204 | st.error("Please upload a Excel file.") 205 | 206 | def langchain_csv(): 207 | st.title("Langchain CSV File Analysis") 208 | from langchain_google_genai import ChatGoogleGenerativeAI 209 | from langchain_community.document_loaders.csv_loader import CSVLoader 210 | from langchain.chains import StuffDocumentsChain 211 | from langchain.chains.llm import LLMChain 212 | from langchain.prompts import PromptTemplate 213 | import asyncio 214 | import nest_asyncio 215 | nest_asyncio.apply() 216 | 217 | # Initialize Model 218 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM") 219 | 220 | # Input for PDF file 221 | uploaded_file = st.file_uploader("Upload PDF", type=["csv"]) 222 | 223 | # Input for the question 224 | question = st.text_input("Enter your question:") 225 | 226 | if st.button("Analyze"): 227 | if uploaded_file is not None: 228 | # Save the uploaded PDF file with the name "file.pdf" 229 | with open("file.csv", "wb") as f: 230 | f.write(uploaded_file.getvalue()) 231 | 232 | # Load the PDF file 233 | loader = CSVLoader(file_path="file.csv") 234 | docs = loader.load() 235 | 236 | # Define the Summarize Chain 237 | template = question + """ Write a concise summary of the following: 238 | "{text}" 239 | CONCISE SUMMARY:""" 240 | 241 | prompt = PromptTemplate.from_template(template) 242 | 243 | llm_chain = LLMChain(llm=llm, prompt=prompt) 244 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text") 245 | 246 | # Invoke Chain 247 | response = stuff_chain.invoke(docs) 248 | summary = response["output_text"] 249 | 250 | # Display the summary 251 | st.header("Summary:") 252 | st.write(summary) 253 | else: 254 | st.error("Please upload a CSV file.") 255 | 256 | def langchain_web(): 257 | st.title("Langchain Web Content Analysis") 258 | from langchain_google_genai import ChatGoogleGenerativeAI 259 | from langchain_community.document_loaders import WebBaseLoader 260 | from langchain.chains import StuffDocumentsChain 261 | from langchain.chains.llm import LLMChain 262 | from langchain.prompts import PromptTemplate 263 | import asyncio 264 | import nest_asyncio 265 | 
nest_asyncio.apply() 266 | 267 | # Initialize Model 268 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM") 269 | 270 | # Input for article link 271 | article_link = st.text_input("Enter the link to the article:") 272 | 273 | # Input for the question 274 | question = st.text_input("Enter your question:") 275 | 276 | if st.button("Analyze"): 277 | if article_link.strip() == "": 278 | st.error("Please enter a link to the article.") 279 | else: 280 | # Load the article content 281 | loader = WebBaseLoader(article_link) 282 | docs = loader.load() 283 | 284 | # Define the Summarize Chain 285 | template = question + """ Write a concise summary of the following: 286 | "{text}" 287 | CONCISE SUMMARY:""" 288 | 289 | prompt = PromptTemplate.from_template(template) 290 | 291 | llm_chain = LLMChain(llm=llm, prompt=prompt) 292 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text") 293 | 294 | # Invoke Chain 295 | response = stuff_chain.invoke(docs) 296 | summary = response["output_text"] 297 | 298 | # Display the summary 299 | st.header("Summary:") 300 | st.write(summary) 301 | 302 | def langchain_youtube(): 303 | st.title("Langchain Youtube Video Analysis") 304 | from langchain_google_genai import ChatGoogleGenerativeAI 305 | from langchain_community.document_loaders import YoutubeLoader 306 | from langchain.chains import StuffDocumentsChain 307 | from langchain.chains.llm import LLMChain 308 | from langchain.prompts import PromptTemplate 309 | import asyncio 310 | import nest_asyncio 311 | nest_asyncio.apply() 312 | 313 | # Initialize Model 314 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM") 315 | 316 | # Input for article link 317 | youtube_link = st.text_input("Enter the YouTube link:") 318 | 319 | # Input for the question 320 | question = st.text_input("Enter your question:") 321 | 322 | if st.button("Analyze"): 323 | if youtube_link.strip() == "": 324 | st.error("Please enter a link to the article.") 325 | else: 326 | # Load the article content 327 | loader = YoutubeLoader.from_youtube_url( 328 | youtube_link, 329 | add_video_info=True, 330 | language=["en", "id"], 331 | translation="en", 332 | ) 333 | docs = loader.load() 334 | 335 | # Define the Summarize Chain 336 | template = question + """ Write a concise summary of the following: 337 | "{text}" 338 | CONCISE SUMMARY:""" 339 | 340 | prompt = PromptTemplate.from_template(template) 341 | 342 | llm_chain = LLMChain(llm=llm, prompt=prompt) 343 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text") 344 | 345 | # Invoke Chain 346 | response = stuff_chain.invoke(docs) 347 | summary = response["output_text"] 348 | 349 | # Display the summary 350 | st.header("Summary:") 351 | st.write(summary) 352 | 353 | # Set CSS to arrange buttons horizontally 354 | st.markdown( 355 | """ 356 | 363 | """, 364 | unsafe_allow_html=True, 365 | ) 366 | 367 | # Get the selected page 368 | selected_page = st.sidebar.radio( 369 | "Select Page", 370 | ("Langchain PDF Text Analysis", 371 | "Langchain Microsoft Word File Analysis", 372 | "Langchain Microsoft Excel File Analysis", 373 | "Langchain Microsoft Power Point File Analysis", 374 | "Langchain CSV File Analysis", 375 | "Langchain Web Content Analysis", 376 | "Langchain Youtube Video Analysis") 377 | ) 378 | 379 | if selected_page == "Langchain PDF Text Analysis": 380 | langchain_pdf() 381 | elif selected_page 
== "Langchain Microsoft Word File Analysis": 382 | langchain_doc() 383 | elif selected_page == "Langchain Microsoft Excel File Analysis": 384 | langchain_excel() 385 | elif selected_page == "Langchain Microsoft Power Point File Analysis": 386 | langchain_ppt() 387 | elif selected_page == "Langchain CSV File Analysis": 388 | langchain_csv() 389 | elif selected_page == "Langchain Web Content Analysis": 390 | langchain_web() 391 | elif selected_page == "Langchain Youtube Video Analysis": 392 | langchain_youtube() 393 | -------------------------------------------------------------------------------- /Langchain LLM/langchain_youtube.py: -------------------------------------------------------------------------------- 1 | from langchain_google_genai import ChatGoogleGenerativeAI 2 | from langchain_google_genai import GoogleGenerativeAIEmbeddings 3 | from langchain_community.document_loaders import YoutubeLoader 4 | from langchain.chains import StuffDocumentsChain 5 | from langchain.chains.llm import LLMChain 6 | from langchain.prompts import PromptTemplate 7 | import google.generativeai as genai 8 | 9 | #genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc") 10 | 11 | #Initialize Model 12 | llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc") 13 | 14 | #Load the blog 15 | loader = YoutubeLoader.from_youtube_url( 16 | "https://www.youtube.com/watch?v=bT8_sZlgOSI", 17 | add_video_info=True, 18 | language=["en", "id"], 19 | translation="en", 20 | ) 21 | docs = loader.load() 22 | 23 | #Define the Summarize Chain 24 | template = """Write a concise summary of the following: 25 | "{text}" 26 | CONCISE SUMMARY:""" 27 | 28 | prompt = PromptTemplate.from_template(template) 29 | 30 | llm_chain = LLMChain(llm=llm, prompt=prompt) 31 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text") 32 | 33 | #Invoke Chain 34 | response=stuff_chain.invoke(docs) 35 | print(response["output_text"]) -------------------------------------------------------------------------------- /LlamaIndex/llamastreamlit.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from bs4 import BeautifulSoup 3 | from llama_index.core import Document, Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex 4 | from llama_index.readers.web import SimpleWebPageReader 5 | from llama_index.vector_stores.chroma import ChromaVectorStore 6 | from llama_index.embeddings.gemini import GeminiEmbedding 7 | from llama_index.llms.gemini import Gemini 8 | from llama_index.core import PromptTemplate 9 | import chromadb 10 | 11 | # Set up Streamlit page title and instructions 12 | st.title("LlamaIndex + Google Gemini Web Article Question Answering") 13 | st.write("Please input the URL of the webpage you'd like to analyze, and ask your question about it.") 14 | 15 | # Input for the webpage URL 16 | url = st.text_input("Enter URL:") 17 | 18 | # Input for the question 19 | question = st.text_input("Ask your question:") 20 | 21 | # If both URL and question are provided, execute the code 22 | if url and question: 23 | # Load webpage content 24 | web_documents = SimpleWebPageReader().load_data([url]) 25 | html_content = web_documents[0].text 26 | 27 | # Parse HTML content 28 | soup = BeautifulSoup(html_content, 'html.parser') 29 | p_tags = soup.findAll('p') 30 | text_content = "" 31 | for each in p_tags: 32 | text_content += each.text + "\n" 33 | 34 | # Convert to Document format 35 | documents = 
[Document(text=text_content)] 36 | 37 | # Initialize Gemini embedding model and LLAMA model 38 | gemini_api_key = "AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA" 39 | gemini_embedding_model = GeminiEmbedding(api_key=gemini_api_key, model_name="models/embedding-001") 40 | llm = Gemini(api_key=gemini_api_key, model_name="models/gemini-pro") 41 | 42 | # Create a client and a new collection 43 | client = chromadb.PersistentClient(path="./chroma_db") 44 | chroma_collection = client.get_or_create_collection("quickstart") 45 | 46 | # Create a vector store 47 | vector_store = ChromaVectorStore(chroma_collection=chroma_collection) 48 | 49 | # Create a storage context 50 | storage_context = StorageContext.from_defaults(vector_store=vector_store) 51 | 52 | # Set Global settings 53 | Settings.llm = llm 54 | Settings.embed_model = gemini_embedding_model 55 | 56 | # Create an index from the documents 57 | index = VectorStoreIndex.from_documents(documents, storage_context=storage_context) 58 | 59 | # Define LLAMA prompt template 60 | template = ( 61 | """ You are an assistant for question-answering tasks. 62 | Use the following context to answer the question. 63 | If you don't know the answer, just say that you don't know. 64 | Use five sentences maximum and keep the answer concise.\n 65 | Question: {query_str} \nContext: {context_str} \nAnswer:""" 66 | ) 67 | llm_prompt = PromptTemplate(template) 68 | 69 | # Query data from the persisted index 70 | query_engine = index.as_query_engine(text_qa_template=llm_prompt) 71 | response = query_engine.query(question) 72 | 73 | # Display just the response text 74 | st.write("Answer:", response.response) 75 | 76 | -------------------------------------------------------------------------------- /MySQL/Data Science Salary Query.sql: -------------------------------------------------------------------------------- 1 | /* SELECT DATA WHERE SALARY > 100000 */ 2 | SELECT * FROM new_schema.ds_salaries 3 | WHERE salary > 100000; 4 | 5 | /* SELECT DATA WHERE SALARY > 100000, Company location in US, Order the salary from the largest */ 6 | SELECT MyUnknownColumn, job_title, salary_in_usd, company_location FROM new_schema.ds_salaries 7 | WHERE salary_in_usd > 100000 8 | AND company_location = 'US' 9 | ORDER BY salary_in_usd DESC; 10 | 11 | /* Count the average Average Salary in USD group by job title and sort from the largest */ 12 | SELECT AVG(salary_in_usd) AS AVERAGE_SALARY_IN_USD, job_title FROM new_schema.ds_salaries 13 | GROUP BY job_title 14 | ORDER BY AVERAGE_SALARY_IN_USD DESC; -------------------------------------------------------------------------------- /MySQL/INNER JOIN COMBINATION.sql: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT Z.customer_id, X.email, CONCAT (X.first_name,' ', X.last_name) AS full_name , Z.inventory_id 2 | FROM sakila.rental Z 3 | LEFT JOIN sakila.customer X 4 | ON Z.customer_id = X.customer_id 5 | ORDER BY inventory_id ASC; 6 | 7 | SELECT A.city_id, A.city, B.country 8 | FROM sakila.city A 9 | INNER JOIN sakila.country B 10 | ON A.city_id = B.country_id; 11 | 12 | SELECT A.film_id, A.actor_id, B.category_id 13 | FROM sakila.film_actor A 14 | RIGHT JOIN sakila.film_category B 15 | ON A.film_id = B.category_id; 16 | -------------------------------------------------------------------------------- /MySQL/README.md: -------------------------------------------------------------------------------- 1 | # SQL Portofolio 2 | Here all of my SQL Portofolio created using MYSQL Workbench 3 | 
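
These scripts are written to be run in MySQL Workbench, but the same queries can also be executed from Python. Below is a minimal sketch using the mysql-connector-python package; the connection details (host, user, password) are placeholder assumptions rather than values from this repository, while the query itself comes from "Data Science Salary Query.sql" in this folder.

```python
import mysql.connector

# Placeholder connection details -- replace with your own MySQL server credentials
conn = mysql.connector.connect(
    host="localhost",
    user="root",
    password="your_password",
    database="new_schema",
)
cursor = conn.cursor()

# Average salary (in USD) per job title, sorted from the highest average
cursor.execute(
    "SELECT AVG(salary_in_usd) AS AVERAGE_SALARY_IN_USD, job_title "
    "FROM new_schema.ds_salaries "
    "GROUP BY job_title "
    "ORDER BY AVERAGE_SALARY_IN_USD DESC"
)
for avg_salary, job_title in cursor.fetchall():
    print(f"{job_title}: {round(avg_salary, 2)}")

cursor.close()
conn.close()
```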
-------------------------------------------------------------------------------- /MySQL/SQL JOIN.sql: -------------------------------------------------------------------------------- 1 | SELECT C.city_id, C.city_name, S.country_name 2 | FROM new_schema.`city` C 3 | JOIN new_schema.`country` S 4 | ON C.city_id = S.country_id; 5 | 6 | SELECT C.city_id, C.city_name, S.country_name 7 | FROM new_schema.`city` C 8 | LEFT JOIN new_schema.`country` S 9 | ON C.city_id = S.country_id; 10 | 11 | SELECT C.city_id, C.city_name, S.country_name 12 | FROM new_schema.`city` C 13 | RIGHT JOIN new_schema.`country` S 14 | ON C.city_id = S.country_id; -------------------------------------------------------------------------------- /MySQL/STUDENTS PERFORMANCE.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM new_schema.students_performance_mv; 2 | 3 | /* COUNT RACE ETHNICITY WHERE test preparation course is completed and ORDER BY ASCENDING */ 4 | SELECT race_ethnicity, COUNT(race_ethnicity) AS TOTAL FROM new_schema.students_performance_mv 5 | WHERE test_preparation_course = 'completed' 6 | GROUP BY race_ethnicity 7 | ORDER BY TOTAL; 8 | 9 | /* COUNT THE TOTAL SCORE EACH STUDENT AND THEN RANK THEM FROM HIGHEST*/ 10 | SELECT gender, race_ethnicity, test_preparation_course, math_score + reading_score + writing_score AS TOTAL_SCORE 11 | FROM new_schema.students_performance_mv 12 | ORDER BY TOTAL_SCORE DESC; 13 | 14 | /* COUNT THE AVERAGE SCORE OF 3 TEST THEN COUNT THE AVERAGE AGAIN GROUP BY RACE ETHNICITY THEN ELIMINATE NULL VALUE AND test preparation course is completed */ 15 | SELECT race_ethnicity, (AVG(math_score + reading_score + writing_score)/3) AS NILAI_3_PELAJARAN_RATA_RATA 16 | FROM new_schema.students_performance_mv 17 | WHERE test_preparation_course = 'completed' 18 | AND NOT race_ethnicity ='' 19 | GROUP BY race_ethnicity 20 | ORDER BY NILAI_3_PELAJARAN_RATA_RATA DESC 21 | -------------------------------------------------------------------------------- /MySQL/SUPERSTORE DATA ANALYSIS.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM new_schema.`sample - superstore - wanda.xlsx - orders`; 2 | 3 | /* SELECT AMOUNT OF CUSTOMER EACH REGION */ 4 | SELECT Region, COUNT(Region) AS TOTAL_CUSTOMER FROM new_schema.`sample - superstore - wanda.xlsx - orders` 5 | GROUP BY Region 6 | ORDER BY TOTAL_CUSTOMER DESC; 7 | 8 | /* COUNT THE QUANTITY EACH REGION */ 9 | SELECT Region, SUM(Quantity) AS TOTAL_QUANTITY FROM new_schema.`sample - superstore - wanda.xlsx - orders` 10 | GROUP BY Region 11 | ORDER BY TOTAL_QUANTITY DESC; 12 | 13 | /* COUNT SALES EACH REGION */ 14 | SELECT Region, ROUND(SUM(Sales),2) AS TOTAL_SALES FROM new_schema.`sample - superstore - wanda.xlsx - orders` 15 | GROUP BY Region 16 | ORDER BY TOTAL_SALES DESC; 17 | 18 | /* FIRST BUY EACH REGION */ 19 | SELECT Region, MIN(Order_Date) AS FIRST_BUYER_DATE FROM new_schema.`sample - superstore - wanda.xlsx - orders` 20 | GROUP BY Region 21 | ORDER BY FIRST_BUYER_DATE; -------------------------------------------------------------------------------- /MySQL/VIRTUAL INTERNSHIP QUERIES.sql: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT Z.SK_ID_CURR, X.SK_ID_CURR, Z.CODE_GENDER, X.NAME_CONTRACT_STATUS 2 | FROM vix.hci_application AS Z 3 | INNER JOIN vix.hci_previous AS X 4 | ON Z.SK_ID_CURR = X.SK_ID_CURR 5 | WHERE Z.TARGET = 0 6 | AND Z.CODE_GENDER = 'F' 7 | AND NOT Z.CNT_CHILDREN = 0 8 | AND 
X.NAME_CONTRACT_STATUS = 'Approved' 9 | ORDER BY X.SK_ID_CURR; 10 | 11 | SELECT DISTINCT Z.SK_ID_CURR, X.SK_ID_CURR, Z.CODE_GENDER, Z.CNT_CHILDREN, Z.TARGET, X.NAME_CONTRACT_TYPE, COUNT(*) AS TOTAL 12 | FROM vix.hci_application AS Z 13 | INNER JOIN vix.hci_previous AS X 14 | ON Z.SK_ID_CURR = X.SK_ID_CURR 15 | GROUP BY X.NAME_CONTRACT_TYPE 16 | /*HAVING Z.CODE_GENDER = 'F' 17 | AND Z.TARGET = 0 18 | AND NOT Z.CNT_CHILDREN = 0*/ 19 | ORDER BY TOTAL DESC; 20 | 21 | SELECT DISTINCT Z.SK_ID_CURR, X.SK_ID_CURR, Z.CODE_GENDER, X.NAME_CONTRACT_STATUS 22 | FROM vix.hci_application AS Z 23 | INNER JOIN vix.hci_previous AS X 24 | ON Z.SK_ID_CURR = X.SK_ID_CURR 25 | WHERE Z.TARGET = 0 26 | AND Z.CODE_GENDER = 'F' 27 | AND NOT Z.CNT_CHILDREN = 0 28 | AND X.NAME_CONTRACT_STATUS = 'Approved'; 29 | -------------------------------------------------------------------------------- /MySQL/exam score analysis.sql: -------------------------------------------------------------------------------- 1 | SELECT gender, race_ethnicity, math_score, reading_score, writing_score, (math_score + reading_score + writing_score) AS total 2 | FROM exam.exams 3 | HAVING total > 200 4 | ORDER BY total DESC; 5 | 6 | SELECT race_ethnicity, AVG(math_score + reading_score + writing_score) AS AVERAGE 7 | FROM exam.exams 8 | GROUP BY race_ethnicity 9 | ORDER BY AVERAGE DESC; 10 | 11 | SELECT race_ethnicity, ROUND(AVG((math_score + reading_score + writing_score)/3),2) AS AVERAGE_SCORE 12 | FROM exam.exams 13 | GROUP BY race_ethnicity 14 | ORDER BY AVERAGE_SCORE DESC; 15 | 16 | SELECT race_ethnicity, ROUND(AVG(math_score),2) AS AVERAGE_MATH, ROUND(AVG(reading_score),2) AS AVERAGE_READING, ROUND(AVG(writing_score),2) AS AVERAGE_WRITING 17 | FROM exam.exams 18 | WHERE math_score > 70 19 | AND reading_score > 70 20 | AND writing_score > 70 21 | GROUP BY race_ethnicity; 22 | 23 | -------------------------------------------------------------------------------- /MySQL/excercise 1.sql: -------------------------------------------------------------------------------- 1 | /* no 1 */ 2 | SELECT VendorID, passenger_count, trip_distance, payment_type FROM new_schema.yellow_tlc_apr2022_1k 3 | WHERE trip_distance < 3 4 | AND payment_type = 3; 5 | 6 | /* no 2 */ 7 | SELECT VendorID, passenger_count, trip_distance, payment_type FROM new_schema.yellow_tlc_apr2022_1k 8 | WHERE trip_distance < 3; 9 | 10 | /* no 3 */ 11 | SELECT VendorID, passenger_count, trip_distance, payment_type FROM new_schema.yellow_tlc_apr2022_1k 12 | WHERE trip_distance < 3 13 | AND passenger_count = 1; 14 | 15 | /* no 4 */ 16 | SELECT VendorID, passenger_count, trip_distance, payment_type FROM new_schema.yellow_tlc_apr2022_1k 17 | WHERE trip_distance 18 | BETWEEN 1.50 AND 1.60; 19 | -------------------------------------------------------------------------------- /MySQL/sakila-dvd-rental.sql: -------------------------------------------------------------------------------- 1 | /* CATEGORIZE FILM WITH ACTOR NAME AND FILM CATEGORY */ 2 | SELECT Z.actor_id, 3 | CONCAT(Z.first_name," ",Z.last_name) AS actor_name, 4 | X.film_id, C.title AS film_title, 5 | B.name AS category 6 | FROM sakila.actor Z 7 | INNER JOIN sakila.film_actor X 8 | ON Z.actor_id = X.actor_id 9 | INNER JOIN sakila.film_text C 10 | ON X.film_id = C.film_id 11 | INNER JOIN sakila.film_category V 12 | ON C.film_id = V.film_id 13 | INNER JOIN sakila.category B 14 | ON V.category_id = B.category_id 15 | WHERE B.name = 'Action'; 16 | 17 | /* CUSTOMER PAYMENT DATA WITH PRICE AND FILM TITLE */ 18 | SELECT 
CONCAT(Z.first_name," ",Z.last_name) AS customer_name, 19 | X.amount, X.payment_date, 20 | C.inventory_id, C.rental_id, 21 | V.film_id, B.title 22 | FROM sakila.customer Z 23 | INNER JOIN sakila.payment X 24 | ON Z.customer_id = X.customer_id 25 | INNER JOIN sakila.rental C 26 | ON X.customer_id = C.customer_id 27 | INNER JOIN sakila.inventory V 28 | ON C.inventory_id = V.inventory_id 29 | INNER JOIN sakila.film_text B 30 | ON V.film_id = B.film_id; 31 | 32 | /* CUSTOMER ADDRESS AND IDENTITY */ 33 | SELECT CONCAT(Z.first_name, " ", Z.last_name) AS name, 34 | Z.email, Z.address_id, 35 | X.address 36 | FROM sakila.customer Z 37 | INNER JOIN sakila.address X 38 | ON Z.address_id = X.address_id -------------------------------------------------------------------------------- /Natural Language Processing/emotion_streamlit.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | import re 6 | from Sastrawi.Stemmer.StemmerFactory import StemmerFactory 7 | from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory 8 | import collections 9 | import pathlib 10 | import textwrap 11 | import google.generativeai as genai 12 | from IPython.display import display 13 | from IPython.display import Markdown 14 | 15 | # Create stemmer 16 | factory = StemmerFactory() 17 | stemmer = factory.create_stemmer() 18 | 19 | # Create stopword remover 20 | stop_factory = StopWordRemoverFactory() 21 | more_stopword = ['dengan', 'ia', 'bahwa', 'oleh', 'rp', 'undang', 'pasal', 'ayat', 'bab'] 22 | data = stop_factory.get_stop_words() + more_stopword 23 | 24 | # Define patterns for removal 25 | hyperlink_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') 26 | number_pattern = re.compile(r'\b\d+\b') 27 | emoticon_pattern = re.compile(u'(' 28 | u'\ud83c[\udf00-\udfff]|' 29 | u'\ud83d[\udc00-\ude4f\ude80-\udeff]|' 30 | u'[\u2600-\u26FF\u2700-\u27BF])+', 31 | re.UNICODE) 32 | 33 | st.title('Sentiment Analysis') 34 | 35 | uploaded_file = st.file_uploader("Upload a CSV file", type="csv") 36 | custom_stopwords = st.text_input('Custom Stopwords (comma-separated)', '') 37 | 38 | if uploaded_file is not None and custom_stopwords: 39 | if st.button('Analyze'): 40 | df = pd.read_csv(uploaded_file) 41 | custom_stopword_list = [word.strip() for word in custom_stopwords.split(',')] 42 | all_stopwords = data + custom_stopword_list 43 | 44 | df['cleaned_text'] = df['full_text'].str.replace(hyperlink_pattern, '') 45 | df['cleaned_text'] = df['cleaned_text'].str.replace(emoticon_pattern, '') 46 | df['cleaned_text'] = df['cleaned_text'].str.replace(number_pattern, '') 47 | 48 | for stopword in custom_stopword_list: 49 | df['cleaned_text'] = df['cleaned_text'].str.replace(stopword, '') 50 | 51 | df['cleaned_text'] = df['cleaned_text'].apply(lambda x: ' '.join( 52 | [stemmer.stem(word) for word in stop_factory.create_stop_word_remover().remove(x).split() 53 | if word.lower() not in all_stopwords] 54 | )) 55 | 56 | from transformers import BertForSequenceClassification, BertTokenizer, BertConfig 57 | 58 | tokenizer = BertTokenizer.from_pretrained("indobert-emotion-classification") 59 | config = BertConfig.from_pretrained("indobert-emotion-classification") 60 | model = BertForSequenceClassification.from_pretrained("indobert-emotion-classification", config=config) 61 | from transformers import pipeline 62 | 63 | nlp = 
pipeline("text-classification", model="indobert-emotion-classification") 64 | results = df['cleaned_text'].apply(lambda x: nlp(x)[0]) 65 | df['label'] = [res['label'] for res in results] 66 | df['score'] = [res['score'] for res in results] 67 | 68 | sentiment_counts = df['label'].value_counts() 69 | 70 | st.write("### Sentiment Distribution") 71 | st.bar_chart(sentiment_counts) 72 | 73 | st.write("### Analysis Results") 74 | st.write(df) 75 | 76 | anger_text = ' '.join(df[df['label'] == 'Anger']['cleaned_text']) 77 | happy_text = ' '.join(df[df['label'] == 'Happy']['cleaned_text']) 78 | neutral_text = ' '.join(df[df['label'] == 'Neutral']['cleaned_text']) 79 | fear_text = ' '.join(df[df['label'] == 'Fear']['cleaned_text']) 80 | sadness_text = ' '.join(df[df['label'] == 'Sadness']['cleaned_text']) 81 | love_text = ' '.join(df[df['label'] == 'Love']['cleaned_text']) 82 | 83 | # Bigrams Anger Sentiment 84 | words1 = anger_text.split() 85 | # Get bigrams 86 | bigrams = list(zip(words1, words1[1:])) 87 | 88 | # Count bigrams 89 | bigram_counts = collections.Counter(bigrams) 90 | 91 | # Get top 10 bigram counts 92 | top_bigrams = dict(bigram_counts.most_common(10)) 93 | 94 | # Create bar chart 95 | plt.figure(figsize=(10, 7)) 96 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center') 97 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90) 98 | plt.xlabel('Bigram Words') 99 | plt.ylabel('Count') 100 | plt.title(f"Top 10 Bigram for Anger Sentiment") 101 | # Save the entire plot as a PNG 102 | plt.tight_layout() 103 | plt.savefig("bigram_anger.png") 104 | st.subheader("Bigram for Anger Sentiment") 105 | st.image("bigram_anger.png") 106 | 107 | def to_markdown(text): 108 | text = text.replace('•', ' *') 109 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 110 | 111 | genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc") 112 | 113 | import PIL.Image 114 | 115 | img = PIL.Image.open("bigram_anger.png") 116 | model = genai.GenerativeModel('gemini-pro-vision') 117 | response = model.generate_content(img) 118 | 119 | response = model.generate_content(["As a marketing consulant, I want to understand consumer insighst based on the chart and the market context so I can use the key findings to formulate actionable insights", img]) 120 | response.resolve() 121 | st.write("**Google Gemini Response About Data**") 122 | st.write(response.text) 123 | 124 | 125 | 126 | 127 | # Bigrams Happy Sentiment 128 | words1 = happy_text.split() 129 | # Get bigrams 130 | bigrams = list(zip(words1, words1[1:])) 131 | 132 | # Count bigrams 133 | bigram_counts = collections.Counter(bigrams) 134 | 135 | # Get top 10 bigram counts 136 | top_bigrams = dict(bigram_counts.most_common(10)) 137 | 138 | # Create bar chart 139 | plt.figure(figsize=(10, 7)) 140 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center') 141 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90) 142 | plt.xlabel('Bigram Words') 143 | plt.ylabel('Count') 144 | plt.title(f"Top 10 Bigram for Happy Sentiment") 145 | # Save the entire plot as a PNG 146 | plt.tight_layout() 147 | plt.savefig("bigram_happy.png") 148 | st.subheader("Bigram for Happy Sentiment") 149 | st.image("bigram_happy.png") 150 | 151 | def to_markdown(text): 152 | text = text.replace('•', ' *') 153 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 154 | 155 | genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc") 156 | 157 | import 
PIL.Image 158 | 159 | img = PIL.Image.open("bigram_happy.png") 160 | model = genai.GenerativeModel('gemini-pro-vision') 161 | response = model.generate_content(img) 162 | 163 | response = model.generate_content(["As a marketing consulant, I want to understand consumer insighst based on the chart and the market context so I can use the key findings to formulate actionable insights", img]) 164 | response.resolve() 165 | st.write("**Google Gemini Response About Data**") 166 | st.write(response.text) 167 | 168 | 169 | 170 | 171 | # Bigrams Neutral Sentiment 172 | words1 = neutral_text.split() 173 | # Get bigrams 174 | bigrams = list(zip(words1, words1[1:])) 175 | 176 | # Count bigrams 177 | bigram_counts = collections.Counter(bigrams) 178 | 179 | # Get top 10 bigram counts 180 | top_bigrams = dict(bigram_counts.most_common(10)) 181 | 182 | # Create bar chart 183 | plt.figure(figsize=(10, 7)) 184 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center') 185 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90) 186 | plt.xlabel('Bigram Words') 187 | plt.ylabel('Count') 188 | plt.title(f"Top 10 Bigram for Neutral Sentiment") 189 | # Save the entire plot as a PNG 190 | plt.tight_layout() 191 | plt.savefig("bigram_neutral.png") 192 | st.subheader("Bigram for Neutral Sentiment") 193 | st.image("bigram_neutral.png") 194 | 195 | def to_markdown(text): 196 | text = text.replace('•', ' *') 197 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 198 | 199 | genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc") 200 | 201 | import PIL.Image 202 | 203 | img = PIL.Image.open("bigram_neutral.png") 204 | model = genai.GenerativeModel('gemini-pro-vision') 205 | response = model.generate_content(img) 206 | 207 | response = model.generate_content(["As a marketing consulant, I want to understand consumer insighst based on the chart and the market context so I can use the key findings to formulate actionable insights", img]) 208 | response.resolve() 209 | st.write("**Google Gemini Response About Data**") 210 | st.write(response.text) 211 | 212 | 213 | 214 | 215 | # Bigrams Fear Sentiment 216 | words1 = fear_text.split() 217 | # Get bigrams 218 | bigrams = list(zip(words1, words1[1:])) 219 | 220 | # Count bigrams 221 | bigram_counts = collections.Counter(bigrams) 222 | 223 | # Get top 10 bigram counts 224 | top_bigrams = dict(bigram_counts.most_common(10)) 225 | 226 | # Create bar chart 227 | plt.figure(figsize=(10, 7)) 228 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center') 229 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90) 230 | plt.xlabel('Bigram Words') 231 | plt.ylabel('Count') 232 | plt.title(f"Top 10 Bigram for Fear Sentiment") 233 | # Save the entire plot as a PNG 234 | plt.tight_layout() 235 | plt.savefig("bigram_fear.png") 236 | st.subheader("Bigram for Fear Sentiment") 237 | st.image("bigram_fear.png") 238 | 239 | def to_markdown(text): 240 | text = text.replace('•', ' *') 241 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 242 | 243 | genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc") 244 | 245 | import PIL.Image 246 | 247 | img = PIL.Image.open("bigram_fear.png") 248 | model = genai.GenerativeModel('gemini-pro-vision') 249 | response = model.generate_content(img) 250 | 251 | response = model.generate_content(["As a marketing consulant, I want to understand consumer insighst based on the chart and the market context so I can use 
the key findings to formulate actionable insights", img]) 252 | response.resolve() 253 | st.write("**Google Gemini Response About Data**") 254 | st.write(response.text) 255 | 256 | 257 | 258 | 259 | # Bigrams Sadness Sentiment 260 | words1 = sadness_text.split() 261 | # Get bigrams 262 | bigrams = list(zip(words1, words1[1:])) 263 | 264 | # Count bigrams 265 | bigram_counts = collections.Counter(bigrams) 266 | 267 | # Get top 10 bigram counts 268 | top_bigrams = dict(bigram_counts.most_common(10)) 269 | 270 | # Create bar chart 271 | plt.figure(figsize=(10, 7)) 272 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center') 273 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90) 274 | plt.xlabel('Bigram Words') 275 | plt.ylabel('Count') 276 | plt.title(f"Top 10 Bigram for Sadness Sentiment") 277 | # Save the entire plot as a PNG 278 | plt.tight_layout() 279 | plt.savefig("bigram_sadness.png") 280 | st.subheader("Bigram for Sadness Sentiment") 281 | st.image("bigram_sadness.png") 282 | 283 | def to_markdown(text): 284 | text = text.replace('•', ' *') 285 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 286 | 287 | genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc") 288 | 289 | import PIL.Image 290 | 291 | img = PIL.Image.open("bigram_sadness.png") 292 | model = genai.GenerativeModel('gemini-pro-vision') 293 | response = model.generate_content(img) 294 | 295 | response = model.generate_content(["As a marketing consulant, I want to understand consumer insighst based on the chart and the market context so I can use the key findings to formulate actionable insights", img]) 296 | response.resolve() 297 | st.write("**Google Gemini Response About Data**") 298 | st.write(response.text) 299 | 300 | 301 | 302 | 303 | # Bigrams Love Sentiment 304 | words1 = love_text.split() 305 | # Get bigrams 306 | bigrams = list(zip(words1, words1[1:])) 307 | 308 | # Count bigrams 309 | bigram_counts = collections.Counter(bigrams) 310 | 311 | # Get top 10 bigram counts 312 | top_bigrams = dict(bigram_counts.most_common(10)) 313 | 314 | # Create bar chart 315 | plt.figure(figsize=(10, 7)) 316 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center') 317 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90) 318 | plt.xlabel('Bigram Words') 319 | plt.ylabel('Count') 320 | plt.title(f"Top 10 Bigram for Love Sentiment") 321 | # Save the entire plot as a PNG 322 | plt.tight_layout() 323 | plt.savefig("bigram_love.png") 324 | st.subheader("Bigram for Love Sentiment") 325 | st.image("bigram_love.png") 326 | 327 | def to_markdown(text): 328 | text = text.replace('•', ' *') 329 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 330 | 331 | genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc") 332 | 333 | import PIL.Image 334 | 335 | img = PIL.Image.open("bigram_love.png") 336 | model = genai.GenerativeModel('gemini-pro-vision') 337 | response = model.generate_content(img) 338 | 339 | response = model.generate_content(["As a marketing consulant, I want to understand consumer insighst based on the chart and the market context so I can use the key findings to formulate actionable insights", img]) 340 | response.resolve() 341 | st.write("**Google Gemini Response About Data**") 342 | st.write(response.text) -------------------------------------------------------------------------------- /Power BI/pbi1.JPG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Power BI/pbi1.JPG -------------------------------------------------------------------------------- /Power BI/pbi2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Power BI/pbi2.JPG -------------------------------------------------------------------------------- /R Language/calculate.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(dplyr) 3 | library(forcats) 4 | library(quantmod) 5 | library(zoo) 6 | library(plotly) 7 | 8 | setwd("D:/RStudio/dataset") 9 | data3 <- read.csv("portfolio_data.csv") 10 | 11 | # Convert the "Date" column to the desired format "2013-05-01" 12 | data3$Date <- as.Date(data3$Date, format = "%m/%d/%Y") 13 | 14 | # Convert the data to an xts object using only the numeric columns 15 | prices_xts <- xts(data3[, -1], order.by = data3$Date) 16 | 17 | # Calculate the returns for each asset 18 | returns_xts <- Return.calculate(prices_xts) 19 | 20 | # Convert the "Date" column to the desired format "2013-05-01" 21 | data3$Date <- as.Date(data3$Date, format = "%m/%d/%Y") 22 | 23 | # Convert the data to an xts object using only the numeric columns 24 | prices_xts <- xts(data3[, -1], order.by = data3$Date) 25 | 26 | # Calculate the returns for each asset 27 | returns_xts <- Return.calculate(prices_xts) 28 | 29 | # Convert the returns back to a data frame 30 | returns_df <- data.frame(Date = index(returns_xts), coredata(returns_xts)) 31 | 32 | # Create an interactive line chart for each asset's returns 33 | plot_ly(data = returns_df, x = ~Date) %>% 34 | add_lines(y = ~AMZN, name = "AMZN", line = list(color = "blue")) %>% 35 | add_lines(y = ~DPZ, name = "DPZ", line = list(color = "green")) %>% 36 | add_lines(y = ~BTC, name = "BTC", line = list(color = "orange")) %>% 37 | add_lines(y = ~NFLX, name = "NFLX", line = list(color = "red")) %>% 38 | layout(title = "Asset Returns Over Time", 39 | xaxis = list(title = "Date"), 40 | yaxis = list(title = "Returns"), 41 | showlegend = TRUE) 42 | -------------------------------------------------------------------------------- /R Language/coba.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(dplyr) 3 | library(forcats) 4 | 5 | setwd("D:/RStudio/dataset") 6 | data <- read.csv("ruu_sql2.csv") 7 | 8 | # Create the countplot 9 | ggplot(data, aes(x = fct_infreq(sponsor))) + 10 | geom_bar(stat = "count") 11 | 12 | # Customized countplot 13 | ggplot(data, aes(x = fct_infreq(sponsor), fill = sponsor)) + 14 | geom_bar() + 15 | labs(title = "Countplot of Sponsor", 16 | x = "Sponsor", 17 | y = "Count") + 18 | theme_minimal() + 19 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) 20 | 21 | 22 | -------------------------------------------------------------------------------- /R Language/portfolio.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(dplyr) 3 | library(forcats) 4 | library(quantmod) 5 | library(zoo) 6 | 7 | setwd("D:/RStudio/dataset") 8 | data1 <- read.csv("portfolio_data.csv") 9 | 10 | # Convert the "Date" column to the desired format "2013-05-01" 11 | data1$Date <- as.Date(data1$Date, format = 
"%m/%d/%Y") 12 | 13 | returns1 <- Return.portfolio(data1) 14 | 15 | 16 | # Convert the xts object to a dataframe and extract the Date column 17 | returns_df <- data.frame(Date = index(returns1), portfolio.returns = coredata(returns1)) 18 | 19 | # Create a line chart 20 | ggplot(data = returns_df, aes(x = Date, y = portfolio.returns)) + 21 | geom_line() + 22 | labs(title = "Portfolio Returns Over Time", 23 | x = "Date", 24 | y = "Portfolio Returns") 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # All-of-Data-Science-Project 2 | This repository is for all of my Data Science Project and Portfolio 3 | -------------------------------------------------------------------------------- /Snowflake Cloud/README.md: -------------------------------------------------------------------------------- 1 | # Snowflake Project 2 | This repository is for all of my Snowflake Cloud that created by myself. Use this code for your reference only, modify this code if you want to use the real time poject 3 | 4 | ## 1. Snowflake Python Connector 5 | This project is about connecting the Database from Snowflake into Jupyter Notebook with Snowflake Python connector. Then we can analyze the data using Python 6 | 7 | ## 2. Snowflake Snowpark Session 8 | This project is about to process data directly within the Snowflake cloud platform, allows the user to build data pipelines and applications for Snowflake in Python, Scala, or Java, allows the user to simplify data preprocessing tasks in Snowflake using familiar programming languages. 9 | -------------------------------------------------------------------------------- /Snowflake Cloud/Snowflake_Python_Connector.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "# Install Required Library" 21 | ], 22 | "metadata": { 23 | "id": "Q5-cg7fitgua" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "colab": { 31 | "base_uri": "https://localhost:8080/" 32 | }, 33 | "id": "vvcfAobHpIau", 34 | "outputId": "2bf0572d-1fc5-46de-e1e2-6ff217deb515" 35 | }, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "name": "stdout", 40 | "text": [ 41 | "Collecting snowflake-connector-python\n", 42 | " Downloading snowflake_connector_python-3.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.6 MB)\n", 43 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.6/2.6 MB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 44 | "\u001b[?25hCollecting asn1crypto<2.0.0,>0.24.0 (from snowflake-connector-python)\n", 45 | " Downloading asn1crypto-1.5.1-py2.py3-none-any.whl (105 kB)\n", 46 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.0/105.0 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 47 | "\u001b[?25hRequirement already satisfied: cffi<2.0.0,>=1.9 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (1.16.0)\n", 48 | "Requirement already satisfied: cryptography<43.0.0,>=3.1.0 in 
/usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (42.0.5)\n", 49 | "Requirement already satisfied: pyOpenSSL<25.0.0,>=16.2.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (24.1.0)\n", 50 | "Requirement already satisfied: pyjwt<3.0.0 in /usr/lib/python3/dist-packages (from snowflake-connector-python) (2.3.0)\n", 51 | "Requirement already satisfied: pytz in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (2023.4)\n", 52 | "Requirement already satisfied: requests<3.0.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (2.31.0)\n", 53 | "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (24.0)\n", 54 | "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (3.3.2)\n", 55 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (3.6)\n", 56 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (2024.2.2)\n", 57 | "Requirement already satisfied: typing-extensions<5,>=4.3 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (4.11.0)\n", 58 | "Requirement already satisfied: filelock<4,>=3.5 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (3.13.4)\n", 59 | "Requirement already satisfied: sortedcontainers>=2.4.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (2.4.0)\n", 60 | "Requirement already satisfied: platformdirs<5.0.0,>=2.6.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (4.2.0)\n", 61 | "Collecting tomlkit (from snowflake-connector-python)\n", 62 | " Downloading tomlkit-0.12.4-py3-none-any.whl (37 kB)\n", 63 | "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi<2.0.0,>=1.9->snowflake-connector-python) (2.22)\n", 64 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0->snowflake-connector-python) (2.0.7)\n", 65 | "Installing collected packages: asn1crypto, tomlkit, snowflake-connector-python\n", 66 | "Successfully installed asn1crypto-1.5.1 snowflake-connector-python-3.8.1 tomlkit-0.12.4\n", 67 | "Collecting snowflake-sqlalchemy\n", 68 | " Downloading snowflake_sqlalchemy-1.5.2-py3-none-any.whl (42 kB)\n", 69 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.4/42.4 kB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 70 | "\u001b[?25hRequirement already satisfied: snowflake-connector-python in /usr/local/lib/python3.10/dist-packages (from snowflake-sqlalchemy) (3.8.1)\n", 71 | "Requirement already satisfied: sqlalchemy in /usr/local/lib/python3.10/dist-packages (from snowflake-sqlalchemy) (2.0.29)\n", 72 | "Requirement already satisfied: asn1crypto<2.0.0,>0.24.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (1.5.1)\n", 73 | "Requirement already satisfied: cffi<2.0.0,>=1.9 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (1.16.0)\n", 74 | "Requirement already satisfied: cryptography<43.0.0,>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (42.0.5)\n", 75 | "Requirement 
already satisfied: pyOpenSSL<25.0.0,>=16.2.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (24.1.0)\n", 76 | "Requirement already satisfied: pyjwt<3.0.0 in /usr/lib/python3/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (2.3.0)\n", 77 | "Requirement already satisfied: pytz in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (2023.4)\n", 78 | "Requirement already satisfied: requests<3.0.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (2.31.0)\n", 79 | "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (24.0)\n", 80 | "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (3.3.2)\n", 81 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (3.6)\n", 82 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (2024.2.2)\n", 83 | "Requirement already satisfied: typing-extensions<5,>=4.3 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (4.11.0)\n", 84 | "Requirement already satisfied: filelock<4,>=3.5 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (3.13.4)\n", 85 | "Requirement already satisfied: sortedcontainers>=2.4.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (2.4.0)\n", 86 | "Requirement already satisfied: platformdirs<5.0.0,>=2.6.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (4.2.0)\n", 87 | "Requirement already satisfied: tomlkit in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (0.12.4)\n", 88 | "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy->snowflake-sqlalchemy) (3.0.3)\n", 89 | "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi<2.0.0,>=1.9->snowflake-connector-python->snowflake-sqlalchemy) (2.22)\n", 90 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0->snowflake-connector-python->snowflake-sqlalchemy) (2.0.7)\n", 91 | "Installing collected packages: snowflake-sqlalchemy\n", 92 | "Successfully installed snowflake-sqlalchemy-1.5.2\n", 93 | "Requirement already satisfied: sqlalchemy in /usr/local/lib/python3.10/dist-packages (2.0.29)\n", 94 | "Requirement already satisfied: typing-extensions>=4.6.0 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy) (4.11.0)\n", 95 | "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy) (3.0.3)\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "!pip install snowflake-connector-python\n", 101 | "!pip install snowflake-sqlalchemy\n", 102 | "!pip install sqlalchemy" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "source": [ 108 | "# Make a Connection into your Snowflake account" 109 | ], 110 | "metadata": { 111 | "id": "HsKtW8CatmJl" 112 | } 113 | }, 114 | { 115 | "cell_type": "code", 116 | "source": [ 117 | 
"import snowflake.connector\n", 118 | "from sqlalchemy import create_engine" 119 | ], 120 | "metadata": { 121 | "id": "NM4SW0NspLPo" 122 | }, 123 | "execution_count": null, 124 | "outputs": [] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "source": [ 129 | "# Snowflake connection parameters\n", 130 | "username = 'MAGICDASH91'\n", 131 | "password = '*************'\n", 132 | "account = 'tk11073.europe-west4.gcp'\n", 133 | "warehouse = 'COMPUTE_WH'\n", 134 | "database = 'DATABASE'\n", 135 | "schema = 'PUBLIC'\n", 136 | "\n", 137 | "# Establishing connection\n", 138 | "conn = snowflake.connector.connect(\n", 139 | " user=username,\n", 140 | " password=password,\n", 141 | " account=account,\n", 142 | " warehouse=warehouse,\n", 143 | " database=database,\n", 144 | " schema=schema\n", 145 | ")\n", 146 | "\n", 147 | "# Creating a cursor object\n", 148 | "cur = conn.cursor()" 149 | ], 150 | "metadata": { 151 | "id": "xFqBAYWWpNON" 152 | }, 153 | "execution_count": null, 154 | "outputs": [] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "source": [ 159 | "import pandas as pd\n", 160 | "\n", 161 | "# Executing a modified query\n", 162 | "cur.execute(\"\"\"\n", 163 | " SELECT\n", 164 | " NAME,\n", 165 | " YEAR,\n", 166 | " SELLING_PRICE,\n", 167 | " CASE\n", 168 | " WHEN SELLING_PRICE < 100000 THEN 'CHEAP'\n", 169 | " WHEN SELLING_PRICE >= 100000 AND SELLING_PRICE <= 200000 THEN 'NORMAL'\n", 170 | " ELSE 'EXPENSIVE'\n", 171 | " END AS SELLING_PRICE_LABEL\n", 172 | " FROM\n", 173 | " BIKE_DETAILS\n", 174 | "\"\"\")\n", 175 | "\n", 176 | "# Fetching results\n", 177 | "rows = cur.fetchall()\n", 178 | "\n", 179 | "# Creating a Pandas DataFrame\n", 180 | "df = pd.DataFrame(rows, columns=['Name', 'Year', 'Selling_Price', 'Selling_Price_Label'])\n", 181 | "\n", 182 | "# Displaying the DataFrame\n", 183 | "df" 184 | ], 185 | "metadata": { 186 | "colab": { 187 | "base_uri": "https://localhost:8080/", 188 | "height": 424 189 | }, 190 | "id": "kAL7Zo4-tA1w", 191 | "outputId": "2f46ddc6-e33f-441d-9822-5960ef26bf27" 192 | }, 193 | "execution_count": null, 194 | "outputs": [ 195 | { 196 | "output_type": "execute_result", 197 | "data": { 198 | "text/plain": [ 199 | " Name Year Selling_Price \\\n", 200 | "0 Royal Enfield Classic 350 2019 175000 \n", 201 | "1 Honda Dio 2017 45000 \n", 202 | "2 Royal Enfield Classic Gunmetal Grey 2018 150000 \n", 203 | "3 Yamaha Fazer FI V 2.0 [2016-2018] 2015 65000 \n", 204 | "4 Yamaha SZ [2013-2014] 2011 20000 \n", 205 | "... ... ... ... \n", 206 | "1056 Activa 3g 2010 17000 \n", 207 | "1057 Honda CB twister 2012 16000 \n", 208 | "1058 Bajaj Discover 125 2013 15000 \n", 209 | "1059 Honda CB Shine 2009 12000 \n", 210 | "1060 Bajaj Pulsar 150 2008 10000 \n", 211 | "\n", 212 | " Selling_Price_Label \n", 213 | "0 NORMAL \n", 214 | "1 CHEAP \n", 215 | "2 NORMAL \n", 216 | "3 CHEAP \n", 217 | "4 CHEAP \n", 218 | "... ... \n", 219 | "1056 CHEAP \n", 220 | "1057 CHEAP \n", 221 | "1058 CHEAP \n", 222 | "1059 CHEAP \n", 223 | "1060 CHEAP \n", 224 | "\n", 225 | "[1061 rows x 4 columns]" 226 | ], 227 | "text/html": [ 228 | "\n", 229 | "
\n", 230 | "
\n", 231 | "\n", 244 | "\n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | "
NameYearSelling_PriceSelling_Price_Label
0Royal Enfield Classic 3502019175000NORMAL
1Honda Dio201745000CHEAP
2Royal Enfield Classic Gunmetal Grey2018150000NORMAL
3Yamaha Fazer FI V 2.0 [2016-2018]201565000CHEAP
4Yamaha SZ [2013-2014]201120000CHEAP
...............
1056Activa 3g201017000CHEAP
1057Honda CB twister201216000CHEAP
1058Bajaj Discover 125201315000CHEAP
1059Honda CB Shine200912000CHEAP
1060Bajaj Pulsar 150200810000CHEAP
\n", 334 | "

1061 rows × 4 columns

\n", 335 | "
\n", 336 | "
\n", 337 | "\n", 338 | "
\n", 339 | " \n", 347 | "\n", 348 | " \n", 388 | "\n", 389 | " \n", 413 | "
\n", 414 | "\n", 415 | "\n", 416 | "
\n", 417 | " \n", 428 | "\n", 429 | "\n", 518 | "\n", 519 | " \n", 541 | "
\n", 542 | "
\n", 543 | "
\n" 544 | ], 545 | "application/vnd.google.colaboratory.intrinsic+json": { 546 | "type": "dataframe", 547 | "variable_name": "df", 548 | "summary": "{\n \"name\": \"df\",\n \"rows\": 1061,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 276,\n \"samples\": [\n \"Hero Xtreme Sports\",\n \"Bajaj Avenger [2015]\",\n \"Bajaj Avenger Street 160\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Year\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4,\n \"min\": 1988,\n \"max\": 2020,\n \"num_unique_values\": 28,\n \"samples\": [\n 2012,\n 2003,\n 2020\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Selling_Price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 56304,\n \"min\": 5000,\n \"max\": 760000,\n \"num_unique_values\": 130,\n \"samples\": [\n 72000,\n 160000,\n 26000\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Selling_Price_Label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"NORMAL\",\n \"CHEAP\",\n \"EXPENSIVE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" 549 | } 550 | }, 551 | "metadata": {}, 552 | "execution_count": 4 553 | } 554 | ] 555 | } 556 | ] 557 | } 558 | -------------------------------------------------------------------------------- /Snowflake Cloud/Snowflake_Snowpark_Session.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "# Install Required Library" 21 | ], 22 | "metadata": { 23 | "id": "xe3ioIeBOEVv" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "colab": { 31 | "base_uri": "https://localhost:8080/" 32 | }, 33 | "id": "nUI_Fe6nLjC-", 34 | "outputId": "a9217013-5cd8-48f6-b4dd-2217f94882f3" 35 | }, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "name": "stdout", 40 | "text": [ 41 | "Requirement already satisfied: snowflake-snowpark-python in /usr/local/lib/python3.10/dist-packages (1.14.0)\n", 42 | "Requirement already satisfied: setuptools>=40.6.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-snowpark-python) (67.7.2)\n", 43 | "Requirement already satisfied: wheel in /usr/local/lib/python3.10/dist-packages (from snowflake-snowpark-python) (0.43.0)\n", 44 | "Requirement already satisfied: snowflake-connector-python<4.0.0,>=3.6.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-snowpark-python) (3.8.1)\n", 45 | "Requirement already satisfied: typing-extensions<5.0.0,>=4.1.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-snowpark-python) (4.11.0)\n", 46 | "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from snowflake-snowpark-python) (6.0.1)\n", 47 | "Requirement already satisfied: cloudpickle!=2.1.0,!=2.2.0,<=2.2.1,>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-snowpark-python) (2.2.1)\n", 48 | "Requirement already satisfied: asn1crypto<2.0.0,>0.24.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (1.5.1)\n", 
49 | "Requirement already satisfied: cffi<2.0.0,>=1.9 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (1.16.0)\n", 50 | "Requirement already satisfied: cryptography<43.0.0,>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (42.0.5)\n", 51 | "Requirement already satisfied: pyOpenSSL<25.0.0,>=16.2.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (24.1.0)\n", 52 | "Requirement already satisfied: pyjwt<3.0.0 in /usr/lib/python3/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2.3.0)\n", 53 | "Requirement already satisfied: pytz in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2023.4)\n", 54 | "Requirement already satisfied: requests<3.0.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2.31.0)\n", 55 | "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (24.0)\n", 56 | "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (3.3.2)\n", 57 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (3.6)\n", 58 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2024.2.2)\n", 59 | "Requirement already satisfied: filelock<4,>=3.5 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (3.13.4)\n", 60 | "Requirement already satisfied: sortedcontainers>=2.4.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2.4.0)\n", 61 | "Requirement already satisfied: platformdirs<5.0.0,>=2.6.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (4.2.0)\n", 62 | "Requirement already satisfied: tomlkit in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (0.12.4)\n", 63 | "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi<2.0.0,>=1.9->snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2.22)\n", 64 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0->snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2.0.7)\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "pip install snowflake-snowpark-python" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "source": [ 75 | "# Create Snowpark Session" 76 | ], 77 | "metadata": { 78 | "id": "P7tMNrSvToO9" 79 | } 80 | }, 81 | { 82 | "cell_type": "code", 83 | "source": [ 84 | "from snowflake.snowpark.session import Session\n", 85 | "\n", 86 | "username = 'MAGICDASH91'\n", 87 | "password = '*************'\n", 88 | "account = 'tk11073.europe-west4.gcp'\n", 89 | "warehouse = 'COMPUTE_WH'\n", 90 | "database = 'DATABASE'\n", 91 | "schema = 
'PUBLIC'\n", 92 | "\n", 93 | "def snowpark_session_create():\n", 94 | " connection_params = {\n", 95 | " \"user\": username,\n", 96 | " \"password\": password,\n", 97 | " \"account\": account,\n", 98 | " \"warehouse\": warehouse,\n", 99 | " \"database\": database,\n", 100 | " \"schema\": schema\n", 101 | " }\n", 102 | "\n", 103 | " # Create the session\n", 104 | " session = Session.builder.configs(connection_params).create()\n", 105 | " return session\n", 106 | "\n", 107 | "demo_session = snowpark_session_create()" 108 | ], 109 | "metadata": { 110 | "id": "_UiOBJ79Mhmb" 111 | }, 112 | "execution_count": null, 113 | "outputs": [] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "source": [ 118 | "# Start Querying your data" 119 | ], 120 | "metadata": { 121 | "id": "F961ofKuT0h9" 122 | } 123 | }, 124 | { 125 | "cell_type": "code", 126 | "source": [ 127 | "df = demo_session.sql('SELECT * FROM CROSS_SELL')\n", 128 | "df.show()" 129 | ], 130 | "metadata": { 131 | "colab": { 132 | "base_uri": "https://localhost:8080/" 133 | }, 134 | "id": "FlYG5jeXTrSr", 135 | "outputId": "e11a1247-a83e-4ebe-b0f6-7f53bb6d0ce2" 136 | }, 137 | "execution_count": null, 138 | "outputs": [ 139 | { 140 | "output_type": "stream", 141 | "name": "stdout", 142 | "text": [ 143 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", 144 | "|\"ID\" |\"GENDER\" |\"AGE\" |\"DRIVING_LICENSE\" |\"REGION_CODE\" |\"PREVIOUSLY_INSURED\" |\"VEHICLE_AGE\" |\"VEHICLE_DAMAGE\" |\"ANNUAL_PREMIUM\" |\"POLICY_SALES_CHANNEL\" |\"VINTAGE\" |\"RESPONSE\" |\n", 145 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", 146 | "|1 |Male |44 |1 |28.0 |0 |> 2 Years |True |40454.0 |26.0 |217 |1 |\n", 147 | "|2 |Male |76 |1 |3.0 |0 |1-2 Year |False |33536.0 |26.0 |183 |0 |\n", 148 | "|3 |Male |47 |1 |28.0 |0 |> 2 Years |True |38294.0 |26.0 |27 |1 |\n", 149 | "|4 |Male |21 |1 |11.0 |1 |< 1 Year |False |28619.0 |152.0 |203 |0 |\n", 150 | "|5 |Female |29 |1 |41.0 |1 |< 1 Year |False |27496.0 |152.0 |39 |0 |\n", 151 | "|6 |Female |24 |1 |33.0 |0 |< 1 Year |True |2630.0 |160.0 |176 |0 |\n", 152 | "|7 |Male |23 |1 |11.0 |0 |< 1 Year |True |23367.0 |152.0 |249 |0 |\n", 153 | "|8 |Female |56 |1 |28.0 |0 |1-2 Year |True |32031.0 |26.0 |72 |1 |\n", 154 | "|9 |Female |24 |1 |3.0 |1 |< 1 Year |False |27619.0 |152.0 |28 |0 |\n", 155 | "|10 |Female |32 |1 |6.0 |1 |< 1 Year |False |28771.0 |152.0 |80 |0 |\n", 156 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", 157 | "\n" 158 | ] 159 | } 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "source": [ 165 | "# Snowflake Transformation" 166 | ], 167 | "metadata": { 168 | "id": "3n2-ctOPg5HG" 169 | } 170 | }, 171 | { 172 | "cell_type": "code", 173 | "source": [ 174 | "import snowflake.snowpark.functions as F" 175 | ], 176 | "metadata": { 177 | "id": "GQI9PNEDVHBl" 178 | }, 179 | "execution_count": null, 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "source": [ 185 | "# Show the Age where Age between 30 and 44\n", 186 | "df_age = df.filter(F.col('AGE').between(30,44))\n", 187 | "df_age.show()" 188 | ], 189 | "metadata": { 190 | "colab": { 191 | 
"base_uri": "https://localhost:8080/" 192 | }, 193 | "id": "Q6aL1zaLhpyw", 194 | "outputId": "0f315e6c-d52c-4658-bd42-a4389bbf1631" 195 | }, 196 | "execution_count": null, 197 | "outputs": [ 198 | { 199 | "output_type": "stream", 200 | "name": "stdout", 201 | "text": [ 202 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", 203 | "|\"ID\" |\"GENDER\" |\"AGE\" |\"DRIVING_LICENSE\" |\"REGION_CODE\" |\"PREVIOUSLY_INSURED\" |\"VEHICLE_AGE\" |\"VEHICLE_DAMAGE\" |\"ANNUAL_PREMIUM\" |\"POLICY_SALES_CHANNEL\" |\"VINTAGE\" |\"RESPONSE\" |\n", 204 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", 205 | "|1 |Male |44 |1 |28.0 |0 |> 2 Years |True |40454.0 |26.0 |217 |1 |\n", 206 | "|10 |Female |32 |1 |6.0 |1 |< 1 Year |False |28771.0 |152.0 |80 |0 |\n", 207 | "|13 |Female |41 |1 |15.0 |1 |1-2 Year |False |31409.0 |14.0 |221 |0 |\n", 208 | "|16 |Male |37 |1 |6.0 |0 |1-2 Year |True |2630.0 |156.0 |147 |1 |\n", 209 | "|19 |Male |42 |1 |28.0 |0 |1-2 Year |True |33667.0 |124.0 |158 |0 |\n", 210 | "|24 |Male |44 |1 |28.0 |0 |1-2 Year |True |41852.0 |163.0 |60 |0 |\n", 211 | "|25 |Male |34 |1 |15.0 |1 |1-2 Year |False |38111.0 |152.0 |180 |0 |\n", 212 | "|35 |Female |32 |1 |30.0 |1 |< 1 Year |False |27638.0 |152.0 |169 |0 |\n", 213 | "|36 |Male |41 |1 |36.0 |1 |1-2 Year |False |30039.0 |124.0 |88 |0 |\n", 214 | "|41 |Male |30 |1 |30.0 |0 |< 1 Year |True |24550.0 |124.0 |45 |0 |\n", 215 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", 216 | "\n" 217 | ] 218 | } 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "source": [ 224 | "# Create an aggregation about Average ANNUAL_PREMIUM for every VEHICLE_AGE\n", 225 | "avg_ann = df.group_by('VEHICLE_AGE').agg(F.avg('ANNUAL_PREMIUM').alias('AVERAGE_ANNUAL_PREMIUM'))\n", 226 | "avg_ann.show()" 227 | ], 228 | "metadata": { 229 | "colab": { 230 | "base_uri": "https://localhost:8080/" 231 | }, 232 | "id": "KrAhFfgVkLu1", 233 | "outputId": "420c4b76-1415-49f9-ccd6-06bb968ec3b6" 234 | }, 235 | "execution_count": null, 236 | "outputs": [ 237 | { 238 | "output_type": "stream", 239 | "name": "stdout", 240 | "text": [ 241 | "--------------------------------------------\n", 242 | "|\"VEHICLE_AGE\" |\"AVERAGE_ANNUAL_PREMIUM\" |\n", 243 | "--------------------------------------------\n", 244 | "|> 2 Years |35654.4994690 |\n", 245 | "|1-2 Year |30523.5821203 |\n", 246 | "|< 1 Year |30119.5520251 |\n", 247 | "--------------------------------------------\n", 248 | "\n" 249 | ] 250 | } 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "source": [ 256 | "# Simple Multiplication\n", 257 | "mul_col = df.with_column(\"AGE & VINTAGE\", F.col('AGE') * F.col('VINTAGE'))\n", 258 | "mul_col.show()" 259 | ], 260 | "metadata": { 261 | "colab": { 262 | "base_uri": "https://localhost:8080/" 263 | }, 264 | "id": "dWGuzthgoXmu", 265 | "outputId": "8e5a7484-934c-4360-9a2d-5c74ec285fdf" 266 | }, 267 | "execution_count": null, 268 | "outputs": [ 269 | { 270 | "output_type": "stream", 271 | "name": "stdout", 272 | "text": [ 273 | 
"----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", 274 | "|\"ID\" |\"GENDER\" |\"AGE\" |\"DRIVING_LICENSE\" |\"REGION_CODE\" |\"PREVIOUSLY_INSURED\" |\"VEHICLE_AGE\" |\"VEHICLE_DAMAGE\" |\"ANNUAL_PREMIUM\" |\"POLICY_SALES_CHANNEL\" |\"VINTAGE\" |\"RESPONSE\" |\"AGE & VINTAGE\" |\n", 275 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", 276 | "|1 |Male |44 |1 |28.0 |0 |> 2 Years |True |40454.0 |26.0 |217 |1 |9548 |\n", 277 | "|2 |Male |76 |1 |3.0 |0 |1-2 Year |False |33536.0 |26.0 |183 |0 |13908 |\n", 278 | "|3 |Male |47 |1 |28.0 |0 |> 2 Years |True |38294.0 |26.0 |27 |1 |1269 |\n", 279 | "|4 |Male |21 |1 |11.0 |1 |< 1 Year |False |28619.0 |152.0 |203 |0 |4263 |\n", 280 | "|5 |Female |29 |1 |41.0 |1 |< 1 Year |False |27496.0 |152.0 |39 |0 |1131 |\n", 281 | "|6 |Female |24 |1 |33.0 |0 |< 1 Year |True |2630.0 |160.0 |176 |0 |4224 |\n", 282 | "|7 |Male |23 |1 |11.0 |0 |< 1 Year |True |23367.0 |152.0 |249 |0 |5727 |\n", 283 | "|8 |Female |56 |1 |28.0 |0 |1-2 Year |True |32031.0 |26.0 |72 |1 |4032 |\n", 284 | "|9 |Female |24 |1 |3.0 |1 |< 1 Year |False |27619.0 |152.0 |28 |0 |672 |\n", 285 | "|10 |Female |32 |1 |6.0 |1 |< 1 Year |False |28771.0 |152.0 |80 |0 |2560 |\n", 286 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", 287 | "\n" 288 | ] 289 | } 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "source": [ 295 | "# For other Snowpark Functions you can check here :\n", 296 | "\n", 297 | "https://docs.snowflake.com/en/developer-guide/snowpark/reference/python/latest/api/snowflake.snowpark.functions.function" 298 | ], 299 | "metadata": { 300 | "id": "5RvyahLxno7x" 301 | } 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "source": [ 306 | "# Alter (Editing) Existing Dataframe" 307 | ], 308 | "metadata": { 309 | "id": "MqHqX9p_rzs6" 310 | } 311 | }, 312 | { 313 | "cell_type": "code", 314 | "source": [ 315 | "# Rename \"AVERAGE_ANNUAL_PREMIUM\" to be \"AVERAGE_ANNUAL_PREMIUM_ALL_AGE\"\n", 316 | "avg_ann = avg_ann.with_column_renamed(F.col('AVERAGE_ANNUAL_PREMIUM'), 'AVERAGE_ANNUAL_PREMIUM_ALL_AGE')\n", 317 | "avg_ann.show()" 318 | ], 319 | "metadata": { 320 | "colab": { 321 | "base_uri": "https://localhost:8080/" 322 | }, 323 | "id": "_i19UKLOsFnp", 324 | "outputId": "896d0273-da93-4764-93c2-dd858cd3d66d" 325 | }, 326 | "execution_count": null, 327 | "outputs": [ 328 | { 329 | "output_type": "stream", 330 | "name": "stdout", 331 | "text": [ 332 | "----------------------------------------------------\n", 333 | "|\"VEHICLE_AGE\" |\"AVERAGE_ANNUAL_PREMIUM_ALL_AGE\" |\n", 334 | "----------------------------------------------------\n", 335 | "|> 2 Years |35654.4994690 |\n", 336 | "|1-2 Year |30523.5821203 |\n", 337 | "|< 1 Year |30119.5520251 |\n", 338 | "----------------------------------------------------\n", 339 | "\n" 340 | ] 341 | } 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "source": [ 347 | "# Snowflake Drop column" 348 | ], 349 | "metadata": { 350 | "id": "pCfM2Bxytgd7" 351 | } 352 | }, 353 | { 354 | "cell_type": "code", 355 | "source": [ 356 | "df.drop(\"ID\").show()" 357 
| ], 358 | "metadata": { 359 | "colab": { 360 | "base_uri": "https://localhost:8080/" 361 | }, 362 | "id": "Wv78trnDtic7", 363 | "outputId": "0879ffc1-ad0e-4622-a74a-a422da61a915" 364 | }, 365 | "execution_count": null, 366 | "outputs": [ 367 | { 368 | "output_type": "stream", 369 | "name": "stdout", 370 | "text": [ 371 | "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", 372 | "|\"GENDER\" |\"AGE\" |\"DRIVING_LICENSE\" |\"REGION_CODE\" |\"PREVIOUSLY_INSURED\" |\"VEHICLE_AGE\" |\"VEHICLE_DAMAGE\" |\"ANNUAL_PREMIUM\" |\"POLICY_SALES_CHANNEL\" |\"VINTAGE\" |\"RESPONSE\" |\n", 373 | "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", 374 | "|Male |44 |1 |28.0 |0 |> 2 Years |True |40454.0 |26.0 |217 |1 |\n", 375 | "|Male |76 |1 |3.0 |0 |1-2 Year |False |33536.0 |26.0 |183 |0 |\n", 376 | "|Male |47 |1 |28.0 |0 |> 2 Years |True |38294.0 |26.0 |27 |1 |\n", 377 | "|Male |21 |1 |11.0 |1 |< 1 Year |False |28619.0 |152.0 |203 |0 |\n", 378 | "|Female |29 |1 |41.0 |1 |< 1 Year |False |27496.0 |152.0 |39 |0 |\n", 379 | "|Female |24 |1 |33.0 |0 |< 1 Year |True |2630.0 |160.0 |176 |0 |\n", 380 | "|Male |23 |1 |11.0 |0 |< 1 Year |True |23367.0 |152.0 |249 |0 |\n", 381 | "|Female |56 |1 |28.0 |0 |1-2 Year |True |32031.0 |26.0 |72 |1 |\n", 382 | "|Female |24 |1 |3.0 |1 |< 1 Year |False |27619.0 |152.0 |28 |0 |\n", 383 | "|Female |32 |1 |6.0 |1 |< 1 Year |False |28771.0 |152.0 |80 |0 |\n", 384 | "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", 385 | "\n" 386 | ] 387 | } 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "source": [ 393 | "# Join the table" 394 | ], 395 | "metadata": { 396 | "id": "HZ39g44VtVOC" 397 | } 398 | }, 399 | { 400 | "cell_type": "code", 401 | "source": [ 402 | "# We have to make 2nd aggregation dataframe\n", 403 | "avg_ann2 = df_age.group_by('VEHICLE_AGE').agg(F.avg('ANNUAL_PREMIUM').alias('AVERAGE_ANNUAL_PREMIUM_30_TO_44'))\n", 404 | "avg_ann2.show()" 405 | ], 406 | "metadata": { 407 | "colab": { 408 | "base_uri": "https://localhost:8080/" 409 | }, 410 | "id": "FHO9Xi03tXCD", 411 | "outputId": "cf36e6a0-91fc-4458-b7fb-a421c5ac3725" 412 | }, 413 | "execution_count": null, 414 | "outputs": [ 415 | { 416 | "output_type": "stream", 417 | "name": "stdout", 418 | "text": [ 419 | "-----------------------------------------------------\n", 420 | "|\"VEHICLE_AGE\" |\"AVERAGE_ANNUAL_PREMIUM_30_TO_44\" |\n", 421 | "-----------------------------------------------------\n", 422 | "|> 2 Years |33157.8273078 |\n", 423 | "|< 1 Year |27853.8153776 |\n", 424 | "|1-2 Year |28789.0972791 |\n", 425 | "-----------------------------------------------------\n", 426 | "\n" 427 | ] 428 | } 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "source": [ 434 | "join_df = avg_ann.join(avg_ann2, \"VEHICLE_AGE\").select(avg_ann.VEHICLE_AGE.alias(\"VEHICLE_AGE\"),\n", 435 | " avg_ann.AVERAGE_ANNUAL_PREMIUM_ALL_AGE,\n", 436 | " avg_ann2.AVERAGE_ANNUAL_PREMIUM_30_TO_44)\n", 437 | "\n", 438 | "join_df.show()" 439 | ], 440 | "metadata": { 441 | "colab": { 442 | "base_uri": "https://localhost:8080/" 443 | }, 444 | "id": "zc1OkGCWt5Ie", 445 | "outputId": "1888e742-f3c3-4b14-9196-b65e76379704" 446 
| }, 447 | "execution_count": null, 448 | "outputs": [ 449 | { 450 | "output_type": "stream", 451 | "name": "stdout", 452 | "text": [ 453 | "----------------------------------------------------------------------------------------\n", 454 | "|\"VEHICLE_AGE\" |\"AVERAGE_ANNUAL_PREMIUM_ALL_AGE\" |\"AVERAGE_ANNUAL_PREMIUM_30_TO_44\" |\n", 455 | "----------------------------------------------------------------------------------------\n", 456 | "|> 2 Years |35654.4994690 |33157.8273078 |\n", 457 | "|1-2 Year |30523.5821203 |28789.0972791 |\n", 458 | "|< 1 Year |30119.5520251 |27853.8153776 |\n", 459 | "----------------------------------------------------------------------------------------\n", 460 | "\n" 461 | ] 462 | } 463 | ] 464 | } 465 | ] 466 | } 467 | -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/README.md: -------------------------------------------------------------------------------- 1 | # Streamlit Web Application Project with Google Gemini 2 | This repository is for all of my Streamlit Web Application that created by myself. Use this code for your reference only, modify this code if you want to use the real time poject 3 | 4 | ## 1. Auto Sentiment Analysis Twitter (Google Gemini) 5 | This project allow the user to do auto sentiment analysis from your twitter dataset and then visualize the result with Wordcloud and Bi-Gram Visualization. After that the Google Gemini will make a conclusion and actionable insight based on the visualization 6 | 7 | ## 2. Chat With Your CSV (Google Gemini) 8 | This project allow the user to analyze their own CSV dataset. The user should input the target variable and columns for analysis (attribute) for Data Visualization. There are 4 Visualization on this project : Countplot Visualization, Histoplot Visualization, Multiclass Countplot Visualization, Multiclass Histoplot Visualization. After that, the user can chat with Google Gemini about all of the visualized data 9 | 10 | ## 3. CheatGPT (Google Gemini) 11 | This project allow the user to upload their image, and then Google Gemini will answer your question based on the uploaded image. You only need to screenshot the exam question to do this 12 | 13 | ## 4. Complete Pack 14 | This project is actually the complete pack for all of the Data Science project. There are : Machine Learning Classification Prediction, Machine Learning Regresion Prediction, PDF Document Analysis, Sentiment Analysis, CSV File Analysis, Clustering, EDA With Google Gemini 15 | 16 | ## 5. E-Commerce Clustering (Google Gemini, K-Means) 17 | This project allow the user to do clustering method from their CSV File. First thing they have to do is upload a CSV File, then pick 3 numerical column for clustering. After that the user need to define how many cluster that they want. The last step is the system will give the 3D Clustering Visualization and the Google Gemini will give some response based on the 3D Clustering result 18 | 19 | ## 6. Fraud Analysis (Google Gemini) 20 | This project is actually for my Google Gen AI Hackathon (Hack2Skill). The user only need to upload their fraud csv dataset after that the user should inpput the target variable and some column for analysis (attribute). After that the Google Gemini will give 4 Visualization, they are : Countplot Visualization, Histoplot Visualization, Multiclass Countplot Visualization, Multiclass Histoplot Visualization. The Google Gemini will give some Conclusion and Actionable Insight each Visualization 21 | 22 | ## 7. 
PDF Document Analysis (Google Gemini) 23 | This project allows the user to analyze their PDF file. The user only needs to upload the PDF file and add some additional stopwords for data cleansing. After that, the system will show the Wordcloud Visualization and Bi-Gram Visualization, and lastly Google Gemini will give a conclusion and actionable insight based on each visualization 24 | 25 | ## 8. Table Scraper Analysis (Google Gemini, BeautifulSoup) 26 | This project allows the user to analyze tables from a selected website link. First, the user puts in the link to analyze, and the system shows all of the tables available on the website. After that, the user selects the columns for analysis and removes the unwanted rows. Lastly, Google Gemini analyzes the selected table and gives a conclusion and actionable insight based on the table 27 | 28 | 29 | ## 9. PDF Document Comparison (Google Gemini, Cosine Similarity) 30 | This project allows the user to compare 2 PDF document files, and the system gives the similarity percentage using cosine similarity. The system also shows a Wordcloud and Bi-Gram Visualization for each document. Lastly, Google Gemini analyzes both documents and gives a conclusion about the similarities and differences between them 31 | 32 | 33 | ## 10. CT Scan and MRI Diagnosis Explanator 34 | This web application helps doctors and medical officers analyze the result of a patient's CT scan or MRI image and points out potential abnormalities 35 | 36 | ## 11. LLM Pandas AI and Google Gemini Analysis 37 | This web application analyzes your CSV dataset and lets the user ask anything about their dataset; PandasAI then gives the answer based on the user's question (answers can be a dataframe or a visualization), and lastly Google Gemini gives an explanation if the answer is a visualization 38 | 39 | ## 12. PDF Documents Comparer 40 | This web application analyzes your PDF files with Langchain and Google Gemini. The user can upload 2 PDF files and then ask any question about both files. 
Then Google Gemini wull analyze the documents 41 | -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/__pycache__/flask.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Streamlit-Web-Application-main/__pycache__/flask.cpython-311.pyc -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/__pycache__/pandasai.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Streamlit-Web-Application-main/__pycache__/pandasai.cpython-311.pyc -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/auto_sentiment_analysis_twitter.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | sns.set_theme(color_codes=True) 5 | import os 6 | import pathlib 7 | import textwrap 8 | import google.generativeai as genai 9 | from IPython.display import display 10 | from IPython.display import Markdown 11 | import PIL.Image 12 | 13 | st.title("Sentiment Analysis") 14 | 15 | from Sastrawi.Stemmer.StemmerFactory import StemmerFactory 16 | from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory 17 | from wordcloud import WordCloud 18 | import PyPDF2 19 | import re 20 | from io import StringIO 21 | import plotly.express as px 22 | import pandas as pd 23 | import collections 24 | from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification 25 | 26 | # Create stemmer 27 | factory = StemmerFactory() 28 | stemmer = factory.create_stemmer() 29 | 30 | # Create stopword remover 31 | stop_factory = StopWordRemoverFactory() 32 | more_stopword = ['dengan', 'ia', 'bahwa', 'oleh', 'rp', 'undang', 'pasal', 'ayat', 'bab'] 33 | data = stop_factory.get_stop_words() + more_stopword 34 | 35 | # Upload the CSV file 36 | uploaded_file = st.file_uploader("Upload CSV file:") 37 | 38 | # User input for delimiter 39 | delimiter_option = st.radio("Select CSV delimiter:", [",", ";"], index=0) 40 | 41 | # Add custom stopwords 42 | custom_stopwords = st.text_input("Enter custom stopwords (comma-separated):") 43 | custom_stopword_list = [word.strip() for word in custom_stopwords.split(",")] if custom_stopwords else [] 44 | 45 | # Check if the file is uploaded 46 | if uploaded_file is not None: 47 | # Read the CSV file into a Pandas DataFrame 48 | if delimiter_option == ",": 49 | df = pd.read_csv(uploaded_file, delimiter=",") 50 | elif delimiter_option == ";": 51 | df = pd.read_csv(uploaded_file, delimiter=";") 52 | else: 53 | st.error("Invalid delimiter option.") 54 | 55 | # Show the DataFrame 56 | st.dataframe(df) 57 | 58 | # Select a column for sentiment analysis 59 | object_columns = df.select_dtypes(include="object").columns 60 | target_variable = st.selectbox("Choose a column for Sentiment Analysis:", object_columns) 61 | 62 | # Perform sentiment analysis on the selected column 63 | if st.button("Perform Sentiment Analysis"): 64 | # Your sentiment analysis logic goes here 65 | st.success(f"Sentiment Analysis performed on column: {target_variable}") 66 | 67 | # Show the selected column 68 
| st.write(f"Selected {target_variable} Column:") 69 | st.dataframe(df[[target_variable]]) 70 | 71 | # Create a new DataFrame with cleaned text column 72 | new_df = df.copy() 73 | 74 | # Create cleaned text column (updated to include custom stopwords) 75 | custom_stopword_list = [word.strip() for word in custom_stopwords.split(",")] if custom_stopwords else [] 76 | new_df['cleaned_text'] = new_df[target_variable].apply(lambda x: ' '.join( 77 | [stemmer.stem(word) for word in stop_factory.create_stop_word_remover().remove(x).split() 78 | if word.lower() not in data and word.lower() not in custom_stopword_list] # Exclude custom stopwords 79 | )) 80 | 81 | # Apply stemming and stopword removal to the selected column 82 | new_df['cleaned_text'] = new_df[target_variable].apply(lambda x: ' '.join([stemmer.stem(word) for word in stop_factory.create_stop_word_remover().remove(x).split() if word.lower() not in data])) 83 | 84 | # Show the cleaned text column 85 | #st.write("Cleaned Text Column:") 86 | #st.dataframe(new_df[['cleaned_text']]) 87 | 88 | # Load the sentiment analysis pipeline 89 | pretrained = "indonesia-bert-sentiment-classification" 90 | model = AutoModelForSequenceClassification.from_pretrained(pretrained) 91 | tokenizer = AutoTokenizer.from_pretrained(pretrained) 92 | sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) 93 | label_index = {'LABEL_0': 'positive', 'LABEL_1': 'neutral', 'LABEL_2': 'negative'} 94 | 95 | # Function to apply sentiment analysis to each row in the 'cleaned_text' column 96 | def analyze_sentiment(text): 97 | result = sentiment_analysis(text) 98 | label = label_index[result[0]['label']] 99 | score = result[0]['score'] 100 | return pd.Series({'sentiment_label': label, 'sentiment_score': score}) 101 | 102 | # Apply sentiment analysis to 'cleaned_text' column 103 | new_df[['sentiment_label', 'sentiment_score']] = new_df['cleaned_text'].apply(analyze_sentiment) 104 | 105 | # Display the results 106 | st.write("Sentiment Analysis Results:") 107 | st.dataframe(new_df[['cleaned_text', 'sentiment_label', 'sentiment_score']]) 108 | 109 | # Count the occurrences of each sentiment label 110 | sentiment_counts = new_df['sentiment_label'].value_counts() 111 | 112 | # Plot a bar chart using seaborn 113 | st.set_option('deprecation.showPyplotGlobalUse', False) 114 | sns.set(style="whitegrid") 115 | plt.figure(figsize=(8, 6)) 116 | sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis") 117 | plt.title('Sentiment Distribution') 118 | plt.xlabel('Sentiment Label') 119 | plt.ylabel('Count') 120 | st.pyplot() 121 | 122 | # Define a dictionary to store sentiment-wise text 123 | sentiment_text = { 124 | "positive": "", 125 | "neutral": "", 126 | "negative": "" 127 | } 128 | 129 | # Loop through each sentiment label 130 | for label in sentiment_counts.index: 131 | # Filter data for the current sentiment 132 | selected_data = new_df[new_df['sentiment_label'] == label] 133 | 134 | # Include custom stopwords back into the cleaned text before concatenation 135 | selected_data['cleaned_text'] = selected_data['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in data and word.lower() not in custom_stopword_list])) # Remove only general stopwords 136 | 137 | # Concatenate cleaned text from the selected data (now including custom stopwords) 138 | sentiment_text[label] = ' '.join(selected_data['cleaned_text'].astype(str)) 139 | 140 | 141 | # Define variables for sentiment-wise text (adjust 
variable names) 142 | #positive_text = "" 143 | #neutral_text = "" 144 | #negative_text = "" 145 | 146 | 147 | # Concatenate cleaned text for each sentiment 148 | positive_text = ' '.join([word for word in new_df[new_df['sentiment_label'] == 'positive']['cleaned_text'].apply(lambda x: ' '.join([w for w in x.split() if w.lower() not in data and w.lower() not in custom_stopword_list]))]) 149 | neutral_text = ' '.join([word for word in new_df[new_df['sentiment_label'] == 'neutral']['cleaned_text'].apply(lambda x: ' '.join([w for w in x.split() if w.lower() not in data and w.lower() not in custom_stopword_list]))]) 150 | negative_text = ' '.join([word for word in new_df[new_df['sentiment_label'] == 'negative']['cleaned_text'].apply(lambda x: ' '.join([w for w in x.split() if w.lower() not in data and w.lower() not in custom_stopword_list]))]) 151 | 152 | 153 | 154 | # Generate WordCloud for positive sentiment 155 | positive_wordcloud = WordCloud( 156 | min_font_size=3, max_words=200, width=800, height=400, 157 | colormap='viridis', background_color='white' 158 | ).generate(positive_text) 159 | 160 | # Save the WordCloud image with a filename 161 | positive_wordcloud_filename = "wordcloud_positive.png" 162 | positive_wordcloud.to_file(positive_wordcloud_filename) 163 | 164 | # Display the saved WordCloud image using Streamlit 165 | st.subheader("WordCloud for Positive Sentiment") 166 | st.image(positive_wordcloud_filename) 167 | 168 | def to_markdown(text): 169 | text = text.replace('•', ' *') 170 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 171 | 172 | genai.configure(api_key="AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA") 173 | 174 | import PIL.Image 175 | 176 | img = PIL.Image.open("wordcloud_positive.png") 177 | model = genai.GenerativeModel('gemini-pro-vision') 178 | response = model.generate_content(img) 179 | 180 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image about wordcloud positive sentiment", img]) 181 | response.resolve() 182 | st.write("**Google Gemini Response About Data**") 183 | st.write(response.text) 184 | 185 | 186 | # Generate WordCloud for neutral sentiment 187 | neutral_wordcloud = WordCloud( 188 | min_font_size=3, max_words=200, width=800, height=400, 189 | colormap='viridis', background_color='white' 190 | ).generate(neutral_text) 191 | 192 | # Save the WordCloud image with a filename 193 | neutral_wordcloud_filename = "wordcloud_neutral.png" 194 | neutral_wordcloud.to_file(neutral_wordcloud_filename) 195 | 196 | # Display the saved WordCloud image using Streamlit 197 | st.subheader("WordCloud for Neutral Sentiment") 198 | st.image(neutral_wordcloud_filename) 199 | 200 | def to_markdown(text): 201 | text = text.replace('•', ' *') 202 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 203 | 204 | genai.configure(api_key="AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA") 205 | 206 | import PIL.Image 207 | 208 | img = PIL.Image.open("wordcloud_neutral.png") 209 | model = genai.GenerativeModel('gemini-pro-vision') 210 | response = model.generate_content(img) 211 | 212 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image about wordcloud neutral sentiment", img]) 213 | response.resolve() 214 | st.write("**Google Gemini Response About Data**") 215 | st.write(response.text) 216 | 217 | 218 | 219 | # Generate WordCloud for negative sentiment 220 | 
negative_wordcloud = WordCloud( 221 | min_font_size=3, max_words=200, width=800, height=400, 222 | colormap='viridis', background_color='white' 223 | ).generate(negative_text) 224 | 225 | # Save the WordCloud image with a filename 226 | negative_wordcloud_filename = "wordcloud_negative.png" 227 | negative_wordcloud.to_file(negative_wordcloud_filename) 228 | 229 | # Display the saved WordCloud image using Streamlit 230 | st.subheader("WordCloud for Negative Sentiment") 231 | st.image(negative_wordcloud_filename) 232 | 233 | def to_markdown(text): 234 | text = text.replace('•', ' *') 235 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 236 | 237 | genai.configure(api_key="AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA") 238 | 239 | import PIL.Image 240 | 241 | img = PIL.Image.open("wordcloud_negative.png") 242 | model = genai.GenerativeModel('gemini-pro-vision') 243 | response = model.generate_content(img) 244 | 245 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image about wordcloud negative sentiment", img]) 246 | response.resolve() 247 | st.write("**Google Gemini Response About Data**") 248 | st.write(response.text) 249 | 250 | 251 | # Bigrams Positive Sentiment 252 | words1 = positive_text.split() 253 | # Get bigrams 254 | bigrams = list(zip(words1, words1[1:])) 255 | 256 | # Count bigrams 257 | bigram_counts = collections.Counter(bigrams) 258 | 259 | # Get top 10 bigram counts 260 | top_bigrams = dict(bigram_counts.most_common(10)) 261 | 262 | # Create bar chart 263 | plt.figure(figsize=(10, 7)) 264 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center') 265 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90) 266 | plt.xlabel('Bigram Words') 267 | plt.ylabel('Count') 268 | plt.title(f"Top 10 Bigram for Positive Sentiment") 269 | # Save the entire plot as a PNG 270 | plt.tight_layout() 271 | plt.savefig("bigram_positive.png") 272 | st.subheader("Bigram for Positive Sentiment") 273 | st.image("bigram_positive.png") 274 | 275 | def to_markdown(text): 276 | text = text.replace('•', ' *') 277 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 278 | 279 | genai.configure(api_key="AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA") 280 | 281 | import PIL.Image 282 | 283 | img = PIL.Image.open("bigram_positive.png") 284 | model = genai.GenerativeModel('gemini-pro-vision') 285 | response = model.generate_content(img) 286 | 287 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image about bigram positive sentiment", img]) 288 | response.resolve() 289 | st.write("**Google Gemini Response About Data**") 290 | st.write(response.text) 291 | 292 | 293 | 294 | # Bigrams Neutral Sentiment 295 | words1 = neutral_text.split() 296 | # Get bigrams 297 | bigrams = list(zip(words1, words1[1:])) 298 | 299 | # Count bigrams 300 | bigram_counts = collections.Counter(bigrams) 301 | 302 | # Get top 10 bigram counts 303 | top_bigrams = dict(bigram_counts.most_common(10)) 304 | 305 | # Create bar chart 306 | plt.figure(figsize=(10, 7)) 307 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center') 308 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90) 309 | plt.xlabel('Bigram Words') 310 | plt.ylabel('Count') 311 | plt.title(f"Top 10 Bigram for Neutral Sentiment") 312 | # Save the entire plot as a PNG 313 | 
plt.tight_layout() 314 | plt.savefig("bigram_neutral.png") 315 | st.subheader("Bigram for Neutral Sentiment") 316 | st.image("bigram_neutral.png") 317 | 318 | def to_markdown(text): 319 | text = text.replace('•', ' *') 320 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 321 | 322 | genai.configure(api_key="AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA") 323 | 324 | import PIL.Image 325 | 326 | img = PIL.Image.open("bigram_neutral.png") 327 | model = genai.GenerativeModel('gemini-pro-vision') 328 | response = model.generate_content(img) 329 | 330 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image about bigram neutral sentiment", img]) 331 | response.resolve() 332 | st.write("**Google Gemini Response About Data**") 333 | st.write(response.text) 334 | 335 | 336 | 337 | # Bigrams Negative Sentiment 338 | words1 = negative_text.split() 339 | # Get bigrams 340 | bigrams = list(zip(words1, words1[1:])) 341 | 342 | # Count bigrams 343 | bigram_counts = collections.Counter(bigrams) 344 | 345 | # Get top 10 bigram counts 346 | top_bigrams = dict(bigram_counts.most_common(10)) 347 | 348 | # Create bar chart 349 | plt.figure(figsize=(10, 7)) 350 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center') 351 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90) 352 | plt.xlabel('Bigram Words') 353 | plt.ylabel('Count') 354 | plt.title(f"Top 10 Bigram for negative Sentiment") 355 | # Save the entire plot as a PNG 356 | plt.tight_layout() 357 | plt.savefig("bigram_negative.png") 358 | st.subheader("Bigram for Negative Sentiment") 359 | st.image("bigram_negative.png") 360 | 361 | def to_markdown(text): 362 | text = text.replace('•', ' *') 363 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 364 | 365 | genai.configure(api_key="AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA") 366 | 367 | import PIL.Image 368 | 369 | img = PIL.Image.open("bigram_negative.png") 370 | model = genai.GenerativeModel('gemini-pro-vision') 371 | response = model.generate_content(img) 372 | 373 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image about bigram negative sentiment", img]) 374 | response.resolve() 375 | st.write("**Google Gemini Response About Data**") 376 | st.write(response.text) 377 | -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/chat_with_your_csv.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import random 3 | import time 4 | import pandas as pd 5 | import seaborn as sns 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | from scipy import stats 9 | import warnings 10 | sns.set_theme(color_codes=True) 11 | import os 12 | import pathlib 13 | import textwrap 14 | import google.generativeai as genai 15 | from IPython.display import display 16 | from IPython.display import Markdown 17 | import time 18 | 19 | 20 | st.title("EDA with Google Gemini") 21 | 22 | # Upload the CSV file 23 | uploaded_file = st.file_uploader("Upload CSV file:") 24 | 25 | # Check if the file is uploaded 26 | if uploaded_file is not None: 27 | # Read the CSV file into a Pandas DataFrame 28 | df = pd.read_csv(uploaded_file) 29 | 30 | # Show the original DataFrame 31 | st.write("Original DataFrame:") 32 | st.dataframe(df) 33 | 34 | 35 | 36 
| st.write("**Countplot Barchart**") 37 | 38 | # Get the names of all columns with data type 'object' (categorical columns) excluding 'Country' 39 | cat_vars = [col for col in df.select_dtypes(include='object').columns if df[col].nunique() > 1 and df[col].nunique() <= 10] 40 | 41 | # Create a figure with subplots 42 | num_cols = len(cat_vars) 43 | num_rows = (num_cols + 2) // 3 44 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows)) 45 | axs = axs.flatten() 46 | 47 | # Create a countplot for the top 10 values of each categorical variable using Seaborn 48 | for i, var in enumerate(cat_vars): 49 | top_values = df[var].value_counts().head(10).index 50 | filtered_df = df.copy() 51 | filtered_df[var] = df[var].apply(lambda x: x if x in top_values else 'Other') 52 | sns.countplot(x=var, data=filtered_df, ax=axs[i]) 53 | axs[i].set_title(var) 54 | axs[i].tick_params(axis='x', rotation=90) 55 | 56 | # Remove any extra empty subplots if needed 57 | if num_cols < len(axs): 58 | for i in range(num_cols, len(axs)): 59 | fig.delaxes(axs[i]) 60 | 61 | # Adjust spacing between subplots 62 | fig.tight_layout() 63 | 64 | # Show plots using Streamlit 65 | st.pyplot(fig) 66 | fig.savefig("plot4.png") 67 | 68 | 69 | 70 | st.write("**Histoplot**") 71 | # Get the names of all columns with data type 'int' or 'float' 72 | num_vars = [col for col in df.select_dtypes(include=['int', 'float']).columns] 73 | 74 | # Create a figure with subplots 75 | num_cols = len(num_vars) 76 | num_rows = (num_cols + 2) // 3 77 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows)) 78 | axs = axs.flatten() 79 | 80 | # Create a histplot for each numeric variable using Seaborn 81 | for i, var in enumerate(num_vars): 82 | sns.histplot(df[var], ax=axs[i], kde=True) 83 | axs[i].set_title(var) 84 | axs[i].set_xlabel('') 85 | 86 | # Remove any extra empty subplots if needed 87 | if num_cols < len(axs): 88 | for i in range(num_cols, len(axs)): 89 | fig.delaxes(axs[i]) 90 | 91 | # Adjust spacing between subplots 92 | fig.tight_layout() 93 | 94 | # Show plots using Streamlit 95 | st.pyplot(fig) 96 | fig.savefig("plot7.png") 97 | 98 | 99 | 100 | # Select target variable 101 | target_variable = st.selectbox("Select target variable:", df.columns) 102 | 103 | # Select columns for analysis 104 | columns_for_analysis = st.multiselect("Select columns for analysis:", [col for col in df.columns if col != target_variable]) 105 | 106 | # Process button 107 | if st.button("Process"): 108 | # Select the target variable and columns for analysis from the original DataFrame 109 | target_variable_data = df[target_variable] 110 | columns_for_analysis_data = df[columns_for_analysis] 111 | 112 | # Display target variable in a dataframe 113 | target_variable_df = df[[target_variable]] 114 | st.write("Target Variable DataFrame:") 115 | st.dataframe(target_variable_df) 116 | 117 | # Display columns for analysis in a dataframe 118 | columns_for_analysis_df = df[columns_for_analysis] 119 | st.write("Columns for Analysis DataFrame:") 120 | st.dataframe(columns_for_analysis_df) 121 | 122 | # Concatenate target variable and columns for analysis into a single DataFrame 123 | df = pd.concat([target_variable_data, columns_for_analysis_data], axis=1) 124 | st.write("Columns for Analysis and Target Variable DataFrame:") 125 | st.dataframe(df) 126 | 127 | # Drop columns with null values more than 25% 128 | null_percentage = df.isnull().sum() / len(df) 129 | columns_to_drop = null_percentage[null_percentage > 0.25].index 130 | 
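The countplot loop above keeps the ten most frequent values of each categorical column and folds the rest into an 'Other' bucket, and the same pattern is repeated in several of the apps further down. A hedged sketch of a small helper that keeps this logic in one place; the name top_n_or_other is illustrative, not part of the original file.

# Sketch of a reusable bucketing helper; top_n_or_other is an illustrative name.
import pandas as pd

def top_n_or_other(series: pd.Series, n: int = 10, other_label: str = "Other") -> pd.Series:
    """Keep the n most frequent values and collapse everything else into one label."""
    top_values = series.value_counts().head(n).index
    return series.where(series.isin(top_values), other_label)

# Usage inside the plotting loop above:
# filtered_df[var] = top_n_or_other(df[var])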
df.drop(columns=columns_to_drop, inplace=True) 131 | 132 | # Fill missing values below 25% with median 133 | for col in df.columns: 134 | if df[col].isnull().sum() > 0: # Check if there are missing values 135 | if null_percentage[col] <= 0.25: 136 | if df[col].dtype in ['float64', 'int64']: # Check if missing values are below 25% 137 | median_value = df[col].median() # Calculate median for the column 138 | df[col].fillna(median_value, inplace=True) 139 | 140 | # Convert object datatype columns to lowercase 141 | for col in df.columns: 142 | if df[col].dtype == 'object': # Check if datatype is object 143 | df[col] = df[col].str.lower() # Convert values to lowercase 144 | 145 | st.write("Cleaned Dataset") 146 | st.dataframe(df) 147 | 148 | 149 | st.write("**Multiclass Barplot**") 150 | # Get the names of all columns with data type 'object' (categorical columns) 151 | cat_cols = df.columns.tolist() 152 | 153 | # Get the names of all columns with data type 'object' (categorical variables) 154 | cat_vars = df.select_dtypes(include=['object']).columns.tolist() 155 | 156 | # Exclude 'Country' from the list if it exists in cat_vars 157 | if target_variable in cat_vars: 158 | cat_vars.remove(target_variable) 159 | 160 | # Create a figure with subplots, but only include the required number of subplots 161 | num_cols = len(cat_vars) 162 | num_rows = (num_cols + 2) // 3 # To make sure there are enough rows for the subplots 163 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows)) 164 | axs = axs.flatten() 165 | 166 | # Create a count plot for each categorical variable 167 | for i, var in enumerate(cat_vars): 168 | top_categories = df[var].value_counts().nlargest(10).index 169 | filtered_df = df[df[var].notnull() & df[var].isin(top_categories)] # Exclude rows with NaN values in the variable 170 | sns.countplot(x=var, hue=target_variable, data=filtered_df, ax=axs[i]) 171 | axs[i].set_xticklabels(axs[i].get_xticklabels(), rotation=90) 172 | 173 | # Remove any remaining blank subplots 174 | for i in range(num_cols, len(axs)): 175 | fig.delaxes(axs[i]) 176 | 177 | # Adjust spacing between subplots 178 | fig.tight_layout() 179 | 180 | # Show plot 181 | st.pyplot(fig) 182 | fig.savefig("plot2.png") 183 | 184 | 185 | 186 | 187 | st.write("**Multiclass Histplot**") 188 | # Get the names of all columns with data type 'object' (categorical columns) 189 | cat_cols = df.columns.tolist() 190 | 191 | # Get the names of all columns with data type 'int' 192 | int_vars = df.select_dtypes(include=['int', 'float']).columns.tolist() 193 | int_vars = [col for col in int_vars if col != target_variable] 194 | 195 | # Create a figure with subplots 196 | num_cols = len(int_vars) 197 | num_rows = (num_cols + 2) // 3 # To make sure there are enough rows for the subplots 198 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows)) 199 | axs = axs.flatten() 200 | 201 | # Create a histogram for each integer variable with hue='Attrition' 202 | for i, var in enumerate(int_vars): 203 | top_categories = df[var].value_counts().nlargest(10).index 204 | filtered_df = df[df[var].notnull() & df[var].isin(top_categories)] 205 | sns.histplot(data=df, x=var, hue=target_variable, kde=True, ax=axs[i]) 206 | axs[i].set_title(var) 207 | 208 | # Remove any extra empty subplots if needed 209 | if num_cols < len(axs): 210 | for i in range(num_cols, len(axs)): 211 | fig.delaxes(axs[i]) 212 | 213 | # Adjust spacing between subplots 214 | fig.tight_layout() 215 | 216 | # Show plot 217 | st.pyplot(fig) 218 | 
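In the multiclass histplot loop just above, top_categories and filtered_df are computed but sns.histplot is then called with data=df, so the top-10 filter never takes effect (the same pattern recurs in fraud_analysis_llm.py further down). A hedged sketch of the loop with the filter wired in; it reuses the df, int_vars, target_variable and axs names from the surrounding code, and the function name is illustrative.

# Sketch: plot each numeric variable against the target using the top-10 filter that was computed.
import seaborn as sns

def plot_numeric_by_target(df, int_vars, target_variable, axs):
    for i, var in enumerate(int_vars):
        top_categories = df[var].value_counts().nlargest(10).index
        filtered_df = df[df[var].notnull() & df[var].isin(top_categories)]
        sns.histplot(data=filtered_df, x=var, hue=target_variable, kde=True, ax=axs[i])
        axs[i].set_title(var)

For genuinely continuous columns the value_counts-based filter only keeps the ten most frequent exact values, so dropping the filter entirely is also a reasonable choice.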
fig.savefig("plot3.png") 219 | 220 | 221 | # Define the paths to the saved plots 222 | plot_paths = ["plot4.png", "plot7.png", "plot2.png", "plot3.png"] 223 | 224 | # Create a new figure 225 | fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(20, 15)) 226 | 227 | # Iterate over each plot path and place it in the corresponding subplot 228 | for i, plot_path in enumerate(plot_paths): 229 | row = i // 2 230 | col = i % 2 231 | img = plt.imread(plot_path) 232 | axs[row, col].imshow(img) 233 | axs[row, col].axis('off') 234 | 235 | # Adjust spacing between subplots 236 | plt.tight_layout() 237 | 238 | # Save the merged plot 239 | fig.savefig("merged_plots.png") 240 | 241 | # Streamed response emulator 242 | 243 | def to_markdown(text): 244 | text = text.replace('•', ' *') 245 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 246 | 247 | genai.configure(api_key="AIzaSyDU0F3ZmGWBrrFpmUv21ZHuJBoTbtm4mL8") 248 | 249 | import PIL.Image 250 | 251 | img = PIL.Image.open("merged_plots.png") 252 | model = genai.GenerativeModel('gemini-pro-vision') 253 | response = model.generate_content(img) 254 | 255 | def response_generator(): 256 | response = response.text 257 | 258 | 259 | for word in response.split(): 260 | yield word + " " 261 | time.sleep(0.05) 262 | 263 | 264 | st.title("Chat with your Data") 265 | 266 | # Initialize chat history 267 | if "messages" not in st.session_state: 268 | st.session_state.messages = [] 269 | 270 | # Display chat messages from history on app rerun 271 | for message in st.session_state.messages: 272 | with st.chat_message(message["role"]): 273 | st.markdown(message["content"]) 274 | 275 | # Accept user input 276 | if prompt := st.chat_input("Ask Your Data"): 277 | # Add user message to chat history 278 | st.session_state.messages.append({"role": "user", "content": prompt}) 279 | # Display user message in chat message container 280 | with st.chat_message("user"): 281 | st.markdown(prompt) 282 | 283 | # Generate Google Gemini response based on user's question 284 | img = PIL.Image.open("merged_plots.png") 285 | model = genai.GenerativeModel('gemini-pro-vision') 286 | response = model.generate_content([prompt, img], stream=True) 287 | response.resolve() 288 | 289 | # Format and display the response 290 | response_text = response.text 291 | response_markdown = to_markdown(response_text) 292 | st.write(response.text) 293 | 294 | # Add assistant response to chat history 295 | st.session_state.messages.append({"role": "assistant", "content": response_text}) 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/cheatgpt.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from PIL import Image 3 | import io 4 | import textwrap 5 | import google.generativeai as genai 6 | from IPython.display import display 7 | from IPython.display import Markdown 8 | 9 | st.title("CheatGPT") 10 | 11 | uploaded_file = st.file_uploader("Upload your PNG or JPG image:", type=["png", "jpg"]) 12 | 13 | if uploaded_file is not None: 14 | 15 | # Validate the file extension 16 | if uploaded_file.type in ["image/png", "image/jpeg"]: 17 | # Read the image bytes 18 | img_bytes = uploaded_file.read() 19 | 20 | # Convert bytes to PIL Image object 21 | img = Image.open(io.BytesIO(img_bytes)) 22 | st.write("Image Uploaded") 23 | st.image(img) 24 | 25 | img.save("image.png") 26 | 27 | def to_markdown(text): 28 | 
text = text.replace('•', ' *') 29 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 30 | 31 | genai.configure(api_key="AIzaSyDU0F3ZmGWBrrFpmUv21ZHuJBoTbtm4mL8") 32 | 33 | import PIL.Image 34 | 35 | img1 = PIL.Image.open("image.png") 36 | model = genai.GenerativeModel('gemini-pro-vision') 37 | response = model.generate_content(img) 38 | 39 | response = model.generate_content(["Answer This Question and give the explanation", img1], stream=True) 40 | response.resolve() 41 | st.write("**Google Gemini Response About Data**") 42 | st.write(response.text) 43 | 44 | -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/compare.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import streamlit as st 4 | from langchain.chains import StuffDocumentsChain 5 | from langchain.chains.llm import LLMChain 6 | from langchain.prompts import PromptTemplate 7 | from langchain_community.document_loaders import PyPDFLoader 8 | from langchain_google_genai import ChatGoogleGenerativeAI 9 | 10 | # Title of the app 11 | st.title("PDF Document Comparer Analysis") 12 | 13 | # Upload the PDF files 14 | uploaded_file1 = st.file_uploader("Upload First PDF file:", type='pdf') 15 | uploaded_file2 = st.file_uploader("Upload Second PDF file:", type='pdf') 16 | question = st.text_input("Insert Question", "Put your question here about both documents") 17 | 18 | async def process_files(): 19 | if uploaded_file1 and uploaded_file2 and question: 20 | # Save the uploaded files as file1.pdf and file2.pdf 21 | file1_path = "file1.pdf" 22 | file2_path = "file2.pdf" 23 | with open(file1_path, "wb") as f1: 24 | f1.write(uploaded_file1.getbuffer()) 25 | with open(file2_path, "wb") as f2: 26 | f2.write(uploaded_file2.getbuffer()) 27 | 28 | # Initialize the LLM with the Google API key 29 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyCFI6cTqFdS-mpZBfi7kxwygewtnuF7PfA") 30 | 31 | # Load the PDF files 32 | loader1 = PyPDFLoader(file1_path) 33 | loader2 = PyPDFLoader(file2_path) 34 | docs1 = loader1.load() 35 | docs2 = loader2.load() 36 | docs3 = docs1 + docs2 37 | 38 | # Define the Summarize Chain 39 | template = """Write a concise summary of the following: 40 | "{text}" 41 | CONCISE SUMMARY:""" 42 | prompt = PromptTemplate.from_template(template) 43 | llm_chain = LLMChain(llm=llm, prompt=prompt) 44 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text") 45 | 46 | # Process both documents 47 | response1 = stuff_chain.invoke(docs1) 48 | response2 = stuff_chain.invoke(docs2) 49 | 50 | # Display the summaries 51 | st.markdown("### Summary of the First Document") 52 | st.write(response1["output_text"]) 53 | 54 | st.markdown("### Summary of the Second Document") 55 | st.write(response2["output_text"]) 56 | 57 | # Additional comparison logic can be added here based on the question 58 | comparison_template = question + """Write a concise summary of the following: 59 | "{text}" 60 | CONCISE SUMMARY:""" 61 | 62 | prompt1 = PromptTemplate.from_template(comparison_template) 63 | llm_chain1 = LLMChain(llm=llm, prompt=prompt1) 64 | stuff_chain1 = StuffDocumentsChain(llm_chain=llm_chain1, document_variable_name="text") 65 | response3 = stuff_chain1.invoke(docs3) 66 | 67 | # Display the comparison result 68 | st.markdown("### Comparison Result") 69 | st.write(response3["output_text"]) 70 | 71 | # Clean up the temporary files 72 | 
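In compare.py the uploads are written to file1.pdf and file2.pdf at the top of process_files(), but the two os.remove calls just below pass uploaded_file1.name and uploaded_file2.name, the browser-side filenames that were never written to disk. A hedged sketch of cleaning up the paths that were actually created.

# Sketch: delete the temporary files that process_files() actually wrote.
import os

for path in (file1_path, file2_path):  # "file1.pdf" and "file2.pdf" from earlier in the function
    if os.path.exists(path):
        os.remove(path)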
os.remove(uploaded_file1.name) 73 | os.remove(uploaded_file2.name) 74 | 75 | if st.button("Process"): 76 | asyncio.run(process_files()) 77 | -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/diagnosis.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import datetime 3 | import os 4 | import PIL.Image 5 | import google.generativeai as genai 6 | from IPython.display import Markdown 7 | import time 8 | import io 9 | from PIL import Image 10 | import textwrap 11 | 12 | # Replace with your GenerativeAI API key 13 | genai.configure(api_key="AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA") 14 | 15 | st.title("CT Scan and MRI Diagnosis Explanator") 16 | 17 | # Initialize chat history 18 | if "messages" not in st.session_state: 19 | st.session_state.messages = [] 20 | 21 | # Display chat messages from history on app rerun 22 | for message in st.session_state.messages: 23 | with st.chat_message(message["role"]): 24 | st.markdown(message["content"]) 25 | 26 | # Upload an image file 27 | uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png"]) 28 | 29 | if uploaded_file is not None: 30 | if uploaded_file.type in ["image/png", "image/jpeg"]: 31 | img_bytes = uploaded_file.read() 32 | img = Image.open(io.BytesIO(img_bytes)) 33 | st.write("Image Uploaded") 34 | st.image(img) 35 | 36 | img.save("image.png") 37 | 38 | def to_markdown(text): # Consider removing if formatting not needed 39 | text = text.replace('•', '  *') 40 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 41 | 42 | model = genai.GenerativeModel('gemini-pro-vision') # Check supported models 43 | response = model.generate_content(["Can you analyze this CT scan or MRI and explain any potential abnormalities?", img], stream=True) 44 | response.resolve() 45 | 46 | st.write("**Google Gemini Response About the image**") 47 | 48 | 49 | # Extract text from all candidates (GitHub solution) 50 | text_parts = [] 51 | for candidate in response.candidates: 52 | text_parts.extend([part.text for part in candidate.content.parts]) 53 | full_text = ''.join(text_parts) # Join text parts for a cohesive response 54 | 55 | st.write(full_text) # Display the combined text 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/ecommerce_clustering_llm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | from sklearn.cluster import KMeans 6 | from sklearn.metrics import silhouette_score 7 | from mpl_toolkits.mplot3d import Axes3D 8 | sns.set_theme(color_codes=True) 9 | import os 10 | import pathlib 11 | import textwrap 12 | import google.generativeai as genai 13 | from IPython.display import display 14 | from IPython.display import Markdown 15 | import streamlit as st 16 | 17 | st.title("Ecommerce Segmentation Analysis") 18 | 19 | # Upload the CSV file 20 | uploaded_file = st.file_uploader("Upload CSV file:") 21 | 22 | # Check if the file is uploaded 23 | if uploaded_file is not None: 24 | # Read the CSV file into a Pandas DataFrame 25 | df = pd.read_csv(uploaded_file) 26 | 27 | # Show the DataFrame 28 | st.dataframe(df) 29 | 30 | # Get numeric columns for clustering 31 | numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns 32 | clustering_columns = 
st.multiselect("Select numeric columns for clustering:", numeric_columns) 33 | 34 | # Check if at least 3 columns are selected 35 | if len(clustering_columns) != 3: 36 | st.warning("Please select exactly 3 numeric columns for clustering.") 37 | else: 38 | # Display the selected columns 39 | st.subheader("Selected Columns for Clustering:") 40 | selected_data = df[clustering_columns] 41 | st.dataframe(selected_data) 42 | 43 | # Remove missing values 44 | selected_data.dropna(inplace=True) 45 | 46 | def visualize_clustering(df, selected_data): 47 | # Visualize the Elbow Method to find optimal clusters 48 | wcss = [] 49 | for i in range(1, 11): 50 | kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0) 51 | kmeans.fit(selected_data) 52 | wcss.append(kmeans.inertia_) 53 | 54 | # Plot the Elbow Method 55 | st.subheader("Elbow Method to Determine Optimal Clusters") 56 | fig, ax = plt.subplots(figsize=(8, 5)) 57 | ax.plot(range(1, 11), wcss, marker='o') 58 | ax.set_title('Elbow Method') 59 | ax.set_xlabel('Number of Clusters') 60 | ax.set_ylabel('WCSS') # Within-Cluster Sum of Squares 61 | st.pyplot(fig) 62 | 63 | # Visualize Silhouette Score for different cluster numbers 64 | silhouette_scores = [] 65 | for n_clusters in range(2, 11): 66 | kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0) 67 | kmeans.fit(selected_data) 68 | silhouette_avg = silhouette_score(selected_data, kmeans.labels_) 69 | silhouette_scores.append(silhouette_avg) 70 | 71 | # Plot Silhouette Score 72 | st.subheader("Silhouette Score for Different Cluster Numbers") 73 | fig, ax = plt.subplots(figsize=(8, 5)) 74 | ax.plot(range(2, 11), silhouette_scores, marker='o') 75 | ax.set_title('Silhouette Score') 76 | ax.set_xlabel('Number of Clusters') 77 | ax.set_ylabel('Silhouette Score') 78 | st.pyplot(fig) 79 | 80 | # Apply KMeans clustering based on user-selected number of clusters 81 | num_clusters = st.slider("Select the number of clusters (2-10):", 2, 10, 3) 82 | kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0) 83 | cluster_labels = kmeans.fit_predict(selected_data) 84 | 85 | # Create a new DataFrame with the cluster labels 86 | clustered_df = pd.DataFrame(cluster_labels, columns=['cluster'], index=selected_data.index) 87 | 88 | # Concatenate the clustered_df with the original DataFrame 89 | df = pd.concat([df, clustered_df], axis=1) 90 | st.subheader("Clustered Dataset") 91 | st.dataframe(df) 92 | 93 | # Visualize clustering results in 3D plot 94 | fig = plt.figure(figsize=(10, 12)) 95 | ax = fig.add_subplot(111, projection='3d') 96 | scatter = ax.scatter(selected_data[clustering_columns[0]], 97 | selected_data[clustering_columns[1]], 98 | selected_data[clustering_columns[2]], 99 | c=cluster_labels, cmap='viridis', s=50) 100 | 101 | ax.set_xlabel(clustering_columns[0]) 102 | ax.set_ylabel(clustering_columns[1]) 103 | ax.set_zlabel(clustering_columns[2]) 104 | ax.set_title(f'3D Clustering (Cluster Amount = {num_clusters})') 105 | 106 | # Add a legend 107 | legend = ax.legend(*scatter.legend_elements(), title="Clusters") 108 | ax.add_artist(legend) 109 | 110 | # Show the 3D plot 111 | st.pyplot(fig) 112 | fig.savefig("plot8.png") 113 | 114 | # Visualize clustering 115 | visualize_clustering(df, selected_data) 116 | 117 | 118 | def to_markdown(text): 119 | text = text.replace('•', ' *') 120 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 121 | 122 | 
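The clustering section above plots WCSS and silhouette scores and then asks the user to pick the cluster count with a slider. If a default suggestion is wanted, the k with the highest silhouette score can be proposed; a hedged sketch follows, with an illustrative function name, reusing the selected_data frame prepared above. Note also that selected_data is a slice of df, so selected_data.dropna(inplace=True) can trigger a SettingWithCopyWarning; reassigning with selected_data = df[clustering_columns].dropna() avoids it.

# Sketch: suggest the cluster count with the best silhouette score over k = 2..10.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def suggest_n_clusters(data, k_range=range(2, 11), random_state=0):
    scores = {}
    for k in k_range:
        labels = KMeans(n_clusters=k, init="k-means++", n_init=10,
                        random_state=random_state).fit_predict(data)
        scores[k] = silhouette_score(data, labels)
    return max(scores, key=scores.get)

# Example: st.slider("Select the number of clusters (2-10):", 2, 10, suggest_n_clusters(selected_data))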
genai.configure(api_key="AIzaSyCY-mXpPt-J0oGRaSiPaeAyAVollbMxCF8") 123 | 124 | import PIL.Image 125 | 126 | img = PIL.Image.open("plot8.png") 127 | model = genai.GenerativeModel('gemini-pro-vision') 128 | response = model.generate_content(img) 129 | 130 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the each cluster colour. write the conclusion in English", img], stream=True) 131 | response.resolve() 132 | st.subheader("**Google Gemini Response About Data**") 133 | st.write(response.text) 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/fraud_analysis_llm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import streamlit as st 6 | sns.set_theme(color_codes=True) 7 | import textwrap 8 | import google.generativeai as genai 9 | from IPython.display import display 10 | from IPython.display import Markdown 11 | 12 | st.title("Fraud Analysis and Detection with Google Gen AI") 13 | 14 | # Upload the CSV file 15 | uploaded_file = st.file_uploader("Upload CSV file:") 16 | 17 | # Check if the file is uploaded 18 | if uploaded_file is not None: 19 | # Read the CSV file into a Pandas DataFrame 20 | df = pd.read_csv(uploaded_file) 21 | 22 | # Show the original DataFrame 23 | st.write("Original DataFrame:") 24 | st.dataframe(df) 25 | 26 | # Data Cleansing 27 | for col in df.columns: 28 | if 'value' in col or 'price' in col or 'cost' in col or 'amount' in col or 'Value' in col or 'Price' in col or 'Cost' in col or 'Amount' in col: 29 | df[col] = df[col].str.replace('$', '') 30 | df[col] = df[col].str.replace('£', '') 31 | df[col] = df[col].str.replace('€', '') 32 | # Remove non-numeric characters 33 | df[col] = df[col].replace('[^\d.-]', '', regex=True).astype(float) 34 | 35 | # Drop columns with null values more than 25% 36 | null_percentage = df.isnull().sum() / len(df) 37 | columns_to_drop = null_percentage[null_percentage > 0.25].index 38 | df.drop(columns=columns_to_drop, inplace=True) 39 | 40 | # Fill missing values below 25% with median 41 | for col in df.columns: 42 | if df[col].isnull().sum() > 0: # Check if there are missing values 43 | if null_percentage[col] <= 0.25: 44 | if df[col].dtype in ['float64', 'int64']: # Check if missing values are below 25% 45 | median_value = df[col].median() # Calculate median for the column 46 | df[col].fillna(median_value, inplace=True) 47 | 48 | # Convert object datatype columns to lowercase 49 | for col in df.columns: 50 | if df[col].dtype == 'object': # Check if datatype is object 51 | df[col] = df[col].str.lower() # Convert values to lowercase 52 | 53 | st.write("Cleaned Dataset") 54 | st.dataframe(df) 55 | 56 | 57 | 58 | st.write("**Countplot Barchart**") 59 | 60 | # Get the names of all columns with data type 'object' (categorical columns) excluding 'Country' 61 | cat_vars = [col for col in df.select_dtypes(include='object').columns if df[col].nunique() > 1 and df[col].nunique() <= 10] 62 | 63 | # Create a figure with subplots 64 | num_cols = len(cat_vars) 65 | num_rows = (num_cols + 2) // 3 66 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows)) 67 | axs = axs.flatten() 68 | 69 | # Create a countplot for the top 10 values of each categorical variable using Seaborn 70 | for i, var in enumerate(cat_vars): 71 | 
top_values = df[var].value_counts().head(10).index 72 | filtered_df = df.copy() 73 | filtered_df[var] = df[var].apply(lambda x: x if x in top_values else 'Other') 74 | sns.countplot(x=var, data=filtered_df, ax=axs[i]) 75 | axs[i].set_title(var) 76 | axs[i].tick_params(axis='x', rotation=90) 77 | 78 | # Remove any extra empty subplots if needed 79 | if num_cols < len(axs): 80 | for i in range(num_cols, len(axs)): 81 | fig.delaxes(axs[i]) 82 | 83 | # Adjust spacing between subplots 84 | fig.tight_layout() 85 | 86 | # Show plots using Streamlit 87 | st.pyplot(fig) 88 | fig.savefig("plot4.png") 89 | 90 | def to_markdown(text): 91 | text = text.replace('•', ' *') 92 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 93 | 94 | genai.configure(api_key="AIzaSyDU0F3ZmGWBrrFpmUv21ZHuJBoTbtm4mL8") 95 | 96 | import PIL.Image 97 | 98 | img = PIL.Image.open("plot4.png") 99 | model = genai.GenerativeModel('gemini-pro-vision') 100 | response = model.generate_content(img) 101 | 102 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image", img], stream=True) 103 | response.resolve() 104 | st.write("**Google Gemini Response About Data**") 105 | st.write(response.text) 106 | 107 | 108 | 109 | # Get the names of all columns with data type 'int' or 'float' 110 | num_vars = [col for col in df.select_dtypes(include=['int', 'float']).columns] 111 | 112 | # Create a figure with subplots 113 | num_cols = len(num_vars) 114 | num_rows = (num_cols + 2) // 3 115 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows)) 116 | axs = axs.flatten() 117 | 118 | # Create a histplot for each numeric variable using Seaborn 119 | for i, var in enumerate(num_vars): 120 | sns.histplot(df[var], ax=axs[i], kde=True) 121 | axs[i].set_title(var) 122 | axs[i].set_xlabel('') 123 | 124 | # Remove any extra empty subplots if needed 125 | if num_cols < len(axs): 126 | for i in range(num_cols, len(axs)): 127 | fig.delaxes(axs[i]) 128 | 129 | # Adjust spacing between subplots 130 | fig.tight_layout() 131 | 132 | # Show plots using Streamlit 133 | st.pyplot(fig) 134 | fig.savefig("plot5.png") 135 | 136 | def to_markdown(text): 137 | text = text.replace('•', ' *') 138 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 139 | 140 | genai.configure(api_key="AIzaSyDU0F3ZmGWBrrFpmUv21ZHuJBoTbtm4mL8") 141 | 142 | img = PIL.Image.open("plot5.png") 143 | model = genai.GenerativeModel('gemini-pro-vision') 144 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image", img], stream=True) 145 | response.resolve() 146 | st.write("**Google Gemini Response About Data**") 147 | st.write(response.text) 148 | 149 | 150 | # Select target variable 151 | target_variable = st.selectbox("Select target variable:", df.columns) 152 | 153 | # Select columns for analysis 154 | columns_for_analysis = st.multiselect("Select columns for analysis:", [col for col in df.columns if col != target_variable]) 155 | 156 | # Process button 157 | if st.button("Process"): 158 | # Select the target variable and columns for analysis from the original DataFrame 159 | target_variable_data = df[target_variable] 160 | columns_for_analysis_data = df[columns_for_analysis] 161 | 162 | # Display target variable in a dataframe 163 | target_variable_df = df[[target_variable]] 164 | st.write("Target Variable DataFrame:") 165 | 
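The data-cleansing loop near the top of fraud_analysis_llm.py above calls .str.replace on every column whose name mentions value, price, cost or amount; if such a column is already numeric the .str accessor raises an AttributeError. A hedged sketch that only touches string columns and strips all non-numeric characters in one pass; the helper name is illustrative.

# Sketch: strip currency symbols and separators from string columns only, in a single pass.
import pandas as pd

def clean_currency_columns(df: pd.DataFrame,
                           keywords=("value", "price", "cost", "amount")) -> pd.DataFrame:
    for col in df.columns:
        if any(k in col.lower() for k in keywords) and df[col].dtype == "object":
            df[col] = (df[col]
                       .replace(r"[^\d.\-]", "", regex=True)  # drops $, £, €, commas, spaces
                       .astype(float))
    return df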
st.dataframe(target_variable_df) 166 | 167 | # Display columns for analysis in a dataframe 168 | columns_for_analysis_df = df[columns_for_analysis] 169 | st.write("Columns for Analysis DataFrame:") 170 | st.dataframe(columns_for_analysis_df) 171 | 172 | # Concatenate target variable and columns for analysis into a single DataFrame 173 | df = pd.concat([target_variable_data, columns_for_analysis_data], axis=1) 174 | 175 | # Drop columns with null values more than 25% 176 | null_percentage = df.isnull().sum() / len(df) 177 | columns_to_drop = null_percentage[null_percentage > 0.25].index 178 | df.drop(columns=columns_to_drop, inplace=True) 179 | 180 | # Fill missing values below 25% with median 181 | for col in df.columns: 182 | if df[col].isnull().sum() > 0: # Check if there are missing values 183 | if null_percentage[col] <= 0.25: 184 | if df[col].dtype in ['float64', 'int64']: # Check if missing values are below 25% 185 | median_value = df[col].median() # Calculate median for the column 186 | df[col].fillna(median_value, inplace=True) 187 | 188 | # Convert object datatype columns to lowercase 189 | for col in df.columns: 190 | if df[col].dtype == 'object': # Check if datatype is object 191 | df[col] = df[col].str.lower() # Convert values to lowercase 192 | 193 | st.write("Cleaned Dataset") 194 | st.dataframe(df) 195 | 196 | st.write("**Multiclass Barplot**") 197 | # Get the names of all columns with data type 'object' (categorical columns) 198 | cat_cols = df.columns.tolist() 199 | 200 | # Get the names of all columns with data type 'object' (categorical variables) 201 | cat_vars = df.select_dtypes(include=['object']).columns.tolist() 202 | 203 | # Exclude 'Country' from the list if it exists in cat_vars 204 | if target_variable in cat_vars: 205 | cat_vars.remove(target_variable) 206 | 207 | # Create a figure with subplots, but only include the required number of subplots 208 | num_cols = len(cat_vars) 209 | num_rows = (num_cols + 2) // 3 # To make sure there are enough rows for the subplots 210 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows)) 211 | axs = axs.flatten() 212 | 213 | # Create a count plot for each categorical variable 214 | for i, var in enumerate(cat_vars): 215 | top_categories = df[var].value_counts().nlargest(10).index 216 | filtered_df = df[df[var].notnull() & df[var].isin(top_categories)] # Exclude rows with NaN values in the variable 217 | sns.countplot(x=var, hue=target_variable, data=filtered_df, ax=axs[i]) 218 | axs[i].set_xticklabels(axs[i].get_xticklabels(), rotation=90) 219 | 220 | # Remove any remaining blank subplots 221 | for i in range(num_cols, len(axs)): 222 | fig.delaxes(axs[i]) 223 | 224 | # Adjust spacing between subplots 225 | fig.tight_layout() 226 | 227 | # Show plot 228 | st.pyplot(fig) 229 | fig.savefig("plot6.png") 230 | 231 | def to_markdown(text): 232 | text = text.replace('•', ' *') 233 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 234 | 235 | genai.configure(api_key="AIzaSyDU0F3ZmGWBrrFpmUv21ZHuJBoTbtm4mL8") 236 | 237 | import PIL.Image 238 | 239 | img = PIL.Image.open("plot6.png") 240 | model = genai.GenerativeModel('gemini-pro-vision') 241 | response = model.generate_content(img) 242 | 243 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image", img], stream=True) 244 | response.resolve() 245 | st.write("**Google Gemini Response About Data**") 246 | st.write(response.text) 247 | 248 | 249 | 
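Throughout these apps the vision model is called twice per plot: once with the bare image (model.generate_content(img)), whose result is immediately overwritten, and once with the prompt. One call is enough, and without stream=True the response is already resolved; a hedged sketch using the plot6.png example above, with the same prompt and model name as the original code.

# Sketch: a single non-streaming Gemini call per plot; the bare generate_content(img) call is redundant.
import PIL.Image
import google.generativeai as genai
import streamlit as st

img = PIL.Image.open("plot6.png")
model = genai.GenerativeModel("gemini-pro-vision")
response = model.generate_content(
    ["You are a professional Data Analyst, write the complete conclusion and "
     "actionable insight based on the image", img]
)
st.write("**Google Gemini Response About Data**")
st.write(response.text)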
st.write("**Multiclass Histplot**") 250 | # Get the names of all columns with data type 'object' (categorical columns) 251 | cat_cols = df.columns.tolist() 252 | 253 | # Get the names of all columns with data type 'int' 254 | int_vars = df.select_dtypes(include=['int', 'float']).columns.tolist() 255 | int_vars = [col for col in int_vars if col != target_variable] 256 | 257 | # Create a figure with subplots 258 | num_cols = len(int_vars) 259 | num_rows = (num_cols + 2) // 3 # To make sure there are enough rows for the subplots 260 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows)) 261 | axs = axs.flatten() 262 | 263 | # Create a histogram for each integer variable with hue='Attrition' 264 | for i, var in enumerate(int_vars): 265 | top_categories = df[var].value_counts().nlargest(10).index 266 | filtered_df = df[df[var].notnull() & df[var].isin(top_categories)] 267 | sns.histplot(data=df, x=var, hue=target_variable, kde=True, ax=axs[i]) 268 | axs[i].set_title(var) 269 | 270 | # Remove any extra empty subplots if needed 271 | if num_cols < len(axs): 272 | for i in range(num_cols, len(axs)): 273 | fig.delaxes(axs[i]) 274 | 275 | # Adjust spacing between subplots 276 | fig.tight_layout() 277 | 278 | # Show plot 279 | st.pyplot(fig) 280 | fig.savefig("plot7.png") 281 | 282 | def to_markdown(text): 283 | text = text.replace('•', ' *') 284 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 285 | 286 | genai.configure(api_key="AIzaSyDU0F3ZmGWBrrFpmUv21ZHuJBoTbtm4mL8") 287 | 288 | import PIL.Image 289 | 290 | img = PIL.Image.open("plot7.png") 291 | model = genai.GenerativeModel('gemini-pro-vision') 292 | response = model.generate_content(img) 293 | 294 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image", img], stream=True) 295 | response.resolve() 296 | st.write("**Google Gemini Response About Data**") 297 | st.write(response.text) 298 | 299 | 300 | 301 | 302 | 303 | -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/indonesia-bert-sentiment-classification/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "indobenchmark/indobert-base-p1", 3 | "_num_labels": 5, 4 | "architectures": [ 5 | "BertForSequenceClassification" 6 | ], 7 | "attention_probs_dropout_prob": 0.1, 8 | "classifier_dropout": null, 9 | "directionality": "bidi", 10 | "gradient_checkpointing": false, 11 | "hidden_act": "gelu", 12 | "hidden_dropout_prob": 0.1, 13 | "hidden_size": 768, 14 | "id2label": { 15 | "0": "LABEL_0", 16 | "1": "LABEL_1", 17 | "2": "LABEL_2" 18 | }, 19 | "initializer_range": 0.02, 20 | "intermediate_size": 3072, 21 | "label2id": { 22 | "LABEL_0": 0, 23 | "LABEL_1": 1, 24 | "LABEL_2": 2 25 | }, 26 | "layer_norm_eps": 1e-12, 27 | "max_position_embeddings": 512, 28 | "model_type": "bert", 29 | "num_attention_heads": 12, 30 | "num_hidden_layers": 12, 31 | "output_past": true, 32 | "pad_token_id": 0, 33 | "pooler_fc_size": 768, 34 | "pooler_num_attention_heads": 12, 35 | "pooler_num_fc_layers": 3, 36 | "pooler_size_per_head": 128, 37 | "pooler_type": "first_token_transform", 38 | "position_embedding_type": "absolute", 39 | "problem_type": "single_label_classification", 40 | "torch_dtype": "float32", 41 | "transformers_version": "4.10.2", 42 | "type_vocab_size": 2, 43 | "use_cache": true, 44 | "vocab_size": 50000 45 | } 46 | 
-------------------------------------------------------------------------------- /Streamlit-Web-Application-main/llmpandas.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import streamlit as st 3 | from langchain_groq.chat_models import ChatGroq 4 | from pandasai import SmartDataframe 5 | import os 6 | from PIL import Image 7 | import textwrap 8 | import google.generativeai as genai 9 | from IPython.display import display 10 | from IPython.display import Markdown 11 | import io 12 | import matplotlib.pyplot as plt 13 | 14 | # Load language model 15 | llm = ChatGroq( 16 | model_name="mixtral-8x7b-32768", 17 | api_key="YOUR_GROQ_API") 18 | 19 | def main(): 20 | st.title("Ask your CSV") 21 | 22 | # Allow user to upload CSV file 23 | uploaded_file = st.file_uploader("Upload CSV file", type=["csv"]) 24 | 25 | if uploaded_file is not None: 26 | # Read uploaded CSV file into pandas DataFrame 27 | data = pd.read_csv(uploaded_file) 28 | st.dataframe(data) 29 | 30 | # Convert DataFrame into SmartDataFrame 31 | df = SmartDataframe(data, config={"llm": llm}) 32 | 33 | # Add text box for user input 34 | question = st.text_input("Ask a question about the data:") 35 | 36 | if st.button("Ask"): 37 | if question: 38 | # Answer the user's question using the language model 39 | answer = df.chat(question) 40 | 41 | # Display the answer 42 | st.write("Answer:", answer) 43 | 44 | # Check if the answer is a visualization 45 | if isinstance(answer, str) and os.path.exists(answer): 46 | # Open the image file 47 | image = Image.open(answer) 48 | # Display the image 49 | st.image(image, caption="Visualization") 50 | 51 | # Save the figure as result.png 52 | plt.savefig("result.png") 53 | 54 | # Generate content using Google Gemini 55 | def to_markdown(text): 56 | text = text.replace('•', ' *') 57 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 58 | 59 | genai.configure(api_key="YOUR_GOOGLE_GEMINI_API") 60 | model = genai.GenerativeModel('gemini-pro-vision') 61 | 62 | img1 = Image.open("result.png") 63 | response = model.generate_content(["You are a Professional Data Analyst, give a conclusion and actionable insight based on the visualization", img1], stream=True) 64 | response.resolve() 65 | 66 | st.write("**Google Gemini Response About Data**") 67 | st.write(response.text) 68 | else: 69 | st.warning("No visualization found.") 70 | else: 71 | st.warning("Please ask a question.") 72 | 73 | if __name__ == "__main__": 74 | main() 75 | 76 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/pdf_comparer.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | import PyPDF2 5 | sns.set_theme(color_codes=True) 6 | import pandas as pd 7 | from io import StringIO 8 | import re 9 | import os 10 | import pathlib 11 | import textwrap 12 | import google.generativeai as genai 13 | from IPython.display import display 14 | from IPython.display import Markdown 15 | # import StemmerFactory class 16 | from Sastrawi.Stemmer.StemmerFactory import StemmerFactory 17 | # create stemmer 18 | factory = StemmerFactory() 19 | stemmer = factory.create_stemmer() 20 | 21 | st.title("PDF Document Comparison") 22 | 23 | additional_stopwords = st.text_input("Enter additional stopwords (comma-separated)", value="") 24 | additional_stopwords = 
additional_stopwords.split(",") 25 | 26 | from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory 27 | stop_factory = StopWordRemoverFactory() 28 | more_stopword = ['dengan', 'ia','bahwa','oleh','rp','undang','pasal','ayat','bab'] 29 | data = stop_factory.get_stop_words()+more_stopword + additional_stopwords 30 | stopword = stop_factory.create_stop_word_remover() 31 | 32 | # Function to read PDF and return string 33 | def read_pdf(file): 34 | # Create a PyPDF2 reader object 35 | pdf_reader = PyPDF2.PdfFileReader(file) 36 | 37 | # Extract text from all pages of PDF 38 | text = "" 39 | for page in range(pdf_reader.getNumPages()): 40 | text += pdf_reader.getPage(page).extractText() 41 | 42 | # Return the text as a string 43 | return text 44 | 45 | # Upload PDF file 46 | file = st.file_uploader("Upload a PDF file", type="pdf", key='text1') 47 | 48 | # If file is uploaded 49 | if file is not None: 50 | # Call read_pdf function to convert PDF to string 51 | text1 = read_pdf(file) 52 | 53 | 54 | # Function to read PDF and return string 55 | def read_pdf(file): 56 | # Create a PyPDF2 reader object 57 | pdf_reader = PyPDF2.PdfFileReader(file) 58 | 59 | # Extract text from all pages of PDF 60 | text = "" 61 | for page in range(pdf_reader.getNumPages()): 62 | text += pdf_reader.getPage(page).extractText() 63 | 64 | # Return the text as a string 65 | return text 66 | 67 | # Upload PDF file 68 | file = st.file_uploader("Upload a PDF file", type="pdf", key='text2') 69 | 70 | # If file is uploaded 71 | if file is not None: 72 | # Call read_pdf function to convert PDF to string 73 | text2 = read_pdf(file) 74 | 75 | 76 | if st.button("Process"): 77 | 78 | sentence1 = text1 79 | output1 = stemmer.stem(sentence1) 80 | 81 | hasil1 = re.sub(r"\d+", "", output1) 82 | hasil1 = re.sub(r'[^a-zA-Z\s]','',output1) 83 | 84 | pattern = re.compile(r'\b(' + r'|'.join(data) + r')\b\s*') 85 | hasil1 = pattern.sub('', hasil1) 86 | 87 | 88 | sentence2 = text2 89 | output2 = stemmer.stem(sentence2) 90 | 91 | hasil2 = re.sub(r"\d+", "", output2) 92 | hasil2 = re.sub(r'[^a-zA-Z\s]','',output2) 93 | 94 | pattern = re.compile(r'\b(' + r'|'.join(data) + r')\b\s*') 95 | hasil2 = pattern.sub('', hasil2) 96 | 97 | documents = [hasil1, hasil2] 98 | from sklearn.feature_extraction.text import CountVectorizer 99 | import pandas as pd 100 | 101 | # Create the Document Term Matrix 102 | count_vectorizer = CountVectorizer(stop_words='english') 103 | count_vectorizer = CountVectorizer() 104 | sparse_matrix = count_vectorizer.fit_transform(documents) 105 | from sklearn.metrics.pairwise import cosine_similarity 106 | cosine_sim = cosine_similarity(sparse_matrix, sparse_matrix) 107 | 108 | 109 | plt.rcParams.update({'font.size': 26}) 110 | 111 | heatmap = plt.figure(figsize =(5, 5)) 112 | sns.heatmap(cosine_sim, fmt='.2g', annot=True) 113 | 114 | 115 | import matplotlib.pyplot as plt 116 | from wordcloud import WordCloud 117 | 118 | # Create a WordCloud object 119 | wordcloud = WordCloud(min_font_size=3,max_words=200,width=1600,height=720, 120 | colormap = 'Set2', background_color='white').generate(hasil1) 121 | 122 | # Display the WordCloud using Matplotlib and Streamlit 123 | fig, ax = plt.subplots() 124 | ax.imshow(wordcloud, interpolation='bilinear') 125 | ax.axis('off') 126 | 127 | 128 | # Create a WordCloud object 129 | wordcloud = WordCloud(min_font_size=3,max_words=200,width=1600,height=720, 130 | colormap = 'Set2', background_color='white').generate(hasil2) 131 | 132 | # Display the WordCloud using 
Matplotlib and Streamlit 133 | fig2, ax = plt.subplots() 134 | ax.imshow(wordcloud, interpolation='bilinear') 135 | ax.axis('off') 136 | 137 | 138 | str=hasil1+hasil2 139 | # Create a WordCloud object 140 | wordcloud = WordCloud(min_font_size=3,max_words=200,width=1600,height=720, 141 | colormap = 'Set2', background_color='white').generate(str) 142 | 143 | # Display the WordCloud using Matplotlib and Streamlit 144 | fig3, ax = plt.subplots() 145 | ax.imshow(wordcloud, interpolation='bilinear') 146 | ax.axis('off') 147 | 148 | 149 | 150 | #bigram visualization 151 | import collections 152 | # Get bigrams 153 | words1 = hasil1.split() 154 | bigrams = list(zip(words1, words1[1:])) 155 | 156 | # Count bigrams 157 | bigram_counts = collections.Counter(bigrams) 158 | 159 | # Get top 10 bigram counts 160 | top_bigrams = dict(bigram_counts.most_common(10)) 161 | 162 | # Create bar chart 163 | plt.rcParams.update({'font.size': 12}) 164 | fig4, ax = plt.subplots() 165 | ax.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center') 166 | ax.set_xticks(range(len(top_bigrams))) 167 | ax.set_xticklabels(list(top_bigrams.keys())) 168 | ax.set_xlabel('Bigram Words') 169 | ax.set_ylabel('Count') 170 | ax.set_title('Top 10 Bigram Word Counts') 171 | plt.xticks(rotation=90) 172 | plt.figure(figsize =(15, 15)) 173 | 174 | 175 | 176 | 177 | #bigram visualization 178 | import collections 179 | # Get bigrams 180 | words2 = hasil2.split() 181 | bigrams = list(zip(words2, words2[1:])) 182 | 183 | # Count bigrams 184 | bigram_counts = collections.Counter(bigrams) 185 | 186 | # Get top 10 bigram counts 187 | top_bigrams = dict(bigram_counts.most_common(10)) 188 | 189 | # Create bar chart 190 | plt.rcParams.update({'font.size': 12}) 191 | fig5, ax = plt.subplots() 192 | ax.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center') 193 | ax.set_xticks(range(len(top_bigrams))) 194 | ax.set_xticklabels(list(top_bigrams.keys())) 195 | ax.set_xlabel('Bigram Words') 196 | ax.set_ylabel('Count') 197 | ax.set_title('Top 10 Bigram Word Counts') 198 | plt.xticks(rotation=90) 199 | plt.figure(figsize =(15, 15)) 200 | 201 | st.write("**Accuracy**") 202 | st.write(heatmap) 203 | 204 | st.write("**WordCloud Document 1**") 205 | st.pyplot(fig) 206 | 207 | st.write("**WordCloud Document 2**") 208 | st.pyplot(fig2) 209 | 210 | st.write("**WordCloud From Both Documents**") 211 | st.pyplot(fig3) 212 | 213 | st.write("**Bi-Gram for Document 1**") 214 | st.pyplot(fig4) 215 | 216 | st.write("**Bi-Gram for Document 2**") 217 | st.pyplot(fig5) 218 | 219 | 220 | def to_markdown(text): 221 | text = text.replace('•', ' *') 222 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 223 | 224 | # Configure genai with API key 225 | genai.configure(api_key="AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA") 226 | 227 | # Instantiate the model 228 | model = genai.GenerativeModel('gemini-1.0-pro-latest') 229 | 230 | # Generate content 231 | response = model.generate_content(["Compare the simmilarities and give some conclusion between these 2 PDF Document : ", hasil1, "and", hasil2], stream=True) 232 | response.resolve() 233 | st.write("**Google Gemini Response About Data**") 234 | st.write(response.text) -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/pdf_document_analysis.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | from Sastrawi.Stemmer.StemmerFactory 
import StemmerFactory 4 | from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory 5 | from wordcloud import WordCloud 6 | import PyPDF2 7 | import re 8 | from io import StringIO 9 | import plotly.express as px 10 | import pandas as pd 11 | import collections 12 | import seaborn as sns 13 | sns.set_theme(color_codes=True) 14 | import os 15 | import pathlib 16 | import textwrap 17 | import google.generativeai as genai 18 | from IPython.display import display 19 | from IPython.display import Markdown 20 | import PIL.Image 21 | import matplotlib.pyplot as plt 22 | 23 | st.title("NLP : PDF Document Analysis") 24 | st.set_option('deprecation.showPyplotGlobalUse', False) 25 | 26 | # Function to convert text to Markdown format 27 | def to_markdown(text): 28 | text = text.replace('•', ' *') 29 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True)) 30 | 31 | # Create stemmer 32 | factory = StemmerFactory() 33 | stemmer = factory.create_stemmer() 34 | 35 | # Create stopword remover 36 | stop_factory = StopWordRemoverFactory() 37 | more_stopword = ['dengan', 'ia', 'bahwa', 'oleh', 'rp', 'undang', 'pasal', 'ayat', 'bab'] 38 | data = stop_factory.get_stop_words() + more_stopword 39 | 40 | # User input for custom stopwords 41 | custom_stopwords = st.text_input("Enter custom stopwords (comma-separated):") 42 | if custom_stopwords: 43 | custom_stopword_list = [word.strip() for word in custom_stopwords.split(",")] 44 | data.extend(custom_stopword_list) 45 | 46 | # Function to read PDF and return string 47 | def read_pdf(file): 48 | pdf_reader = PyPDF2.PdfFileReader(file) 49 | text = "" 50 | for page in range(pdf_reader.getNumPages()): 51 | text += pdf_reader.getPage(page).extractText() 52 | return text 53 | 54 | # Upload PDF file 55 | file = st.file_uploader("Upload a PDF file", type="pdf", key='text1') 56 | 57 | # If file is uploaded 58 | if file is not None: 59 | # Call read_pdf function to convert PDF to string 60 | text1 = read_pdf(file) 61 | 62 | # Stem and preprocess the text 63 | sentence1 = text1 64 | output1 = stemmer.stem(sentence1) 65 | hasil1 = re.sub(r"\d+", "", output1) 66 | hasil1 = re.sub(r'[^a-zA-Z\s]', '', hasil1) 67 | pattern = re.compile(r'\b(' + r'|'.join(data) + r')\b\s*') 68 | hasil1 = pattern.sub('', hasil1) 69 | 70 | # Create WordCloud 71 | wordcloud = WordCloud( 72 | min_font_size=3, max_words=200, width=800, height=400, 73 | colormap='Set2', background_color='white' 74 | ).generate(hasil1) 75 | 76 | # Save the WordCloud image 77 | wordcloud_file = "wordcloud.png" 78 | wordcloud.to_file(wordcloud_file) 79 | 80 | # Display the WordCloud using Streamlit 81 | st.subheader(f"Wordcloud Visualization") 82 | st.image(wordcloud_file) 83 | 84 | # Use Google Gemini API to generate content based on the uploaded image 85 | st.subheader("Google Gemini Response") 86 | 87 | # Load the image 88 | img = PIL.Image.open(wordcloud_file) 89 | 90 | # Configure and use the GenerativeAI model 91 | genai.configure(api_key="AIzaSyDU0F3ZmGWBrrFpmUv21ZHuJBoTbtm4mL8") 92 | model = genai.GenerativeModel('gemini-pro-vision') 93 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image", img], stream=True) 94 | response.resolve() 95 | 96 | # Display Gemini API response in Markdown format 97 | st.write(response.text) 98 | 99 | # Use Google Gemini API to generate content based on the WordCloud image 100 | genai.configure(api_key="AIzaSyDU0F3ZmGWBrrFpmUv21ZHuJBoTbtm4mL8") 101 | model 
= genai.GenerativeModel('gemini-pro-vision') 102 | response_gemini = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image", img], stream=True) 103 | response_gemini.resolve() 104 | 105 | # Bigram visualization 106 | # Get bigrams 107 | words1 = hasil1.split() 108 | # Get bigrams 109 | bigrams = list(zip(words1, words1[1:])) 110 | 111 | # Count bigrams 112 | bigram_counts = collections.Counter(bigrams) 113 | 114 | # Get top 10 bigram counts 115 | top_bigrams = dict(bigram_counts.most_common(10)) 116 | 117 | # Create bar chart 118 | plt.figure(figsize=(10, 7)) 119 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center') 120 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90) 121 | plt.xlabel('Bigram Words') 122 | plt.ylabel('Count') 123 | plt.title(f"Top 10 Bigram from PDF Document") 124 | 125 | # Add Gemini response text to the plot 126 | gemini_response_text = response_gemini.text 127 | 128 | # Save the entire plot as a PNG 129 | plt.tight_layout() 130 | plt.savefig("bigram_with_gemini_response.png") 131 | 132 | # Display the plot and Gemini response in Streamlit 133 | st.subheader("Bigram for PDF Document") 134 | st.image("bigram_with_gemini_response.png") 135 | st.subheader("Google Gemini Response") 136 | st.write(gemini_response_text) 137 | -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/table_scraper_analysis.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import requests 3 | from bs4 import BeautifulSoup 4 | import pandas as pd 5 | import pathlib 6 | import textwrap 7 | import google.generativeai as genai 8 | from IPython.display import display 9 | from IPython.display import Markdown 10 | 11 | def scrape_tables(url): 12 | """ 13 | Scrapes all tables from a given URL and returns them as a list of DataFrames. 14 | 15 | Args: 16 | url: The URL of the webpage to scrape. 17 | 18 | Returns: 19 | A list of pandas DataFrames, each representing a scraped table. 20 | """ 21 | # Fetch the HTML content 22 | response = requests.get(url) 23 | response.raise_for_status() # Raise an error if the request fails 24 | 25 | # Parse the HTML content 26 | soup = BeautifulSoup(response.content, "html.parser") 27 | 28 | # Find all tables 29 | tables = soup.find_all("table") 30 | 31 | # Extract data and convert to DataFrames 32 | all_dataframes = [] 33 | for table in tables: 34 | # Extract rows from the table 35 | rows = table.find_all("tr") 36 | table_data = [] 37 | for row in rows: 38 | # Extract cells from each row 39 | cells = row.find_all(["th", "td"]) # Consider both headers and data cells 40 | row_data = [cell.text.strip() for cell in cells] # Extract text and strip whitespace 41 | table_data.append(row_data) 42 | 43 | # Check if there's data before creating a DataFrame 44 | if table_data: 45 | df = pd.DataFrame(table_data) 46 | all_dataframes.append(df) 47 | 48 | return all_dataframes 49 | 50 | def display_and_modify_tables(dataframes): 51 | """ 52 | Displays scraped DataFrames in Streamlit and allows user interaction for modifications. 53 | 54 | Args: 55 | dataframes: A list of pandas DataFrames containing scraped data. 
56 | """ 57 | # Display all scraped tables (head) 58 | if dataframes: 59 | st.subheader("Scraped Tables:") 60 | for i, df in enumerate(dataframes): 61 | st.write(f"Table {i+1}") 62 | st.dataframe(df.head()) # Show only the head (first few rows) 63 | 64 | # Table selection for modification 65 | selected_table_index = st.selectbox("Select a Table to Modify", range(len(dataframes))) 66 | selected_df = dataframes[selected_table_index] 67 | 68 | # Display the full selected table 69 | st.subheader(f"Selected Table {selected_table_index+1}") 70 | st.dataframe(selected_df) 71 | 72 | # Row selection for removal with multi-select 73 | rows_to_remove = st.multiselect("Select rows to remove (0-based):", selected_df.index.tolist(), key="rows_to_remove") 74 | 75 | # Combined button for row removal with confirmation 76 | if st.button("Remove Selected Rows"): 77 | if rows_to_remove: # Check if any rows were selected 78 | try: 79 | selected_df.drop(rows_to_remove, axis=0, inplace=True) # Remove rows 80 | st.success(f"Selected rows removed successfully!") 81 | # Display the modified DataFrame 82 | st.subheader(f"Modified Table {selected_table_index+1}") 83 | st.dataframe(selected_df) 84 | except Exception as e: 85 | st.error(f"Error removing rows: {e}") 86 | 87 | # --- Google Gemini Integration --- 88 | # Convert the DataFrame to a string variable 89 | df_string = selected_df.to_string() 90 | 91 | # Configure genai with API key (replace with your actual key) 92 | genai.configure(api_key="AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA") # Replace with your Google GenerativeAI API key 93 | 94 | model = genai.GenerativeModel('gemini-1.0-pro-latest') 95 | 96 | try: 97 | # Generate content with Gemini 98 | response = model.generate_content(["You are a Professional Data Analyst, Make a Summary and actionable insight based on the csv dataset here :", df_string], stream=True) 99 | response.resolve() 100 | st.write("**Google Gemini Response About Data**") 101 | st.write(response.text) 102 | except Exception as e: 103 | st.error(f"Error generating content with Google Gemini: {e}") 104 | 105 | 106 | # Streamlit app 107 | st.title("Table Scraper and Modifier App") 108 | url = st.text_input("Enter the URL to scrape:") 109 | if url: 110 | try: 111 | scraped_dataframes = scrape_tables(url) 112 | display_and_modify_tables(scraped_dataframes) 113 | except requests.exceptions.RequestException as e: 114 | st.error(f"An error occurred scraping the URL: {e}") 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /Streamlit-Web-Application-main/web_scrape.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import requests 3 | from bs4 import BeautifulSoup 4 | import spacy 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | sns.set_theme(color_codes=True) 9 | 10 | st.title("Web Article Summarizer") 11 | 12 | target_url = st.text_input("Enter the target URL:") 13 | process_button = st.button("Scrape Text") # Button text adjusted 14 | 15 | def scrape_text(url): 16 | """Scrapes text from a website and returns the extracted text. 17 | 18 | Args: 19 | url: The URL of the website to scrape. 20 | 21 | Returns: 22 | The scraped text content as a string, or None if there's an error. 
23 | """ 24 | 25 | if not url: # Check if URL is empty 26 | return None 27 | 28 | try: 29 | # Send HTTP request and parse HTML content 30 | response = requests.get(url) 31 | soup = BeautifulSoup(response.content, "html.parser") 32 | 33 | # Extract text based on your desired method (modify as needed) 34 | # Here, we're extracting text from all paragraphs 35 | paragraphs = soup.find_all("p") 36 | paragraph_text = [] 37 | for paragraph in paragraphs[:2]: # Limit to first 2 paragraphs 38 | paragraph_text.append(paragraph.text.strip()) 39 | 40 | # Combine text from all paragraphs (limited to first 2) 41 | all_paragraph_text = "\n".join(paragraph_text) 42 | 43 | return all_paragraph_text 44 | except Exception as e: 45 | st.error(f"Error scraping text: {e}") 46 | return None 47 | 48 | if process_button: # Only execute if button is clicked 49 | scraped_text = scrape_text(target_url) 50 | 51 | if scraped_text: 52 | st.success("Text scraped successfully!") 53 | st.subheader("Showing First Paragraphs of Article:") 54 | st.write(scraped_text) # Show only the first 2 paragraphs 55 | 56 | # Load English tokenizer, tagger, parser and NER 57 | nlp = spacy.load("en_core_web_sm") 58 | 59 | # Process the scraped text 60 | doc = nlp(scraped_text) 61 | 62 | # Analyze syntax - Extract Noun Phrases 63 | noun_phrases = [chunk.text for chunk in doc.noun_chunks] 64 | 65 | # Create DataFrame using Pandas (alternative to columns argument) 66 | noun_phrases_df = pd.DataFrame(noun_phrases, columns=["Noun Phrase"]) # Create DataFrame with Pandas 67 | 68 | # Display Noun Phrases in Streamlit table 69 | st.subheader("Noun Phrases:") 70 | st.dataframe(noun_phrases_df) 71 | 72 | # Analyze syntax - Extract Verbs 73 | verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"] 74 | 75 | # Create DataFrame for Verbs 76 | verbs_df = pd.DataFrame(verbs, columns=["Verb"]) 77 | 78 | # Display Verbs in Streamlit table 79 | st.subheader("Verbs:") 80 | st.dataframe(verbs_df) 81 | 82 | 83 | # Analyze Part-of-Speech Distribution 84 | pos_counts = {token.pos_: 0 for token in doc} 85 | for token in doc: 86 | pos_counts[token.pos_] += 1 87 | 88 | # Create Part-of-Speech Distribution Plot (using matplotlib) 89 | plt.figure(figsize=(8, 6)) 90 | plt.bar(pos_counts.keys(), pos_counts.values()) 91 | plt.xlabel("Part of Speech") 92 | plt.ylabel("Count") 93 | plt.xticks(rotation=45) 94 | plt.tight_layout() 95 | 96 | # Display Part-of-Speech Distribution Plot in Streamlit 97 | st.subheader("Part-of-Speech Distribution :") 98 | st.pyplot(plt) 99 | 100 | else: 101 | st.warning("No text found on the provided URL or an error occurred.") 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /Tableau/Dashboard 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Tableau/Dashboard 1.png --------------------------------------------------------------------------------