├── Apache Airflow
│   ├── Welcome_DAG.py
│   ├── dag_preprocessing.py
│   ├── population_dag.py
│   └── wiki_dag.py
├── Classification Prediction Scikit Learn
│   ├── Bank Customer Churn Prediction.ipynb
│   ├── Bank Customer Churn.ipynb
│   ├── Bank Loan Approval Exploratory Data Analysis.ipynb
│   ├── Bank Turnover Prediction.ipynb
│   ├── Customer Churn Prediction.ipynb
│   ├── Customer Credit Risk Prediction.ipynb
│   ├── Customer Personality Prediction to Boost Marketing Campaign.ipynb
│   ├── Customer Satisfaction in Airline Prediction.ipynb
│   ├── Fraudulent Claim on Cars Physical Damage Prediction.ipynb
│   ├── Loan Default Prediction.ipynb
│   ├── Loan Defaults Prediction.ipynb
│   ├── Loan Prediction Analytics Vidhya Competition.ipynb
│   ├── Loan Prediction Analytics Vidhya.ipynb
│   ├── Loan Prediction Based on Customer Behavior.ipynb
│   ├── Loan Prediction.ipynb
│   ├── Predict CLTV of a customer.ipynb
│   ├── Predict Customer Clicked Ads Classification.ipynb
│   ├── Predict if a client will subscribe to a term deposit.ipynb
│   └── Travel Insurance Prediction.ipynb
├── End to End Data Science Project
│   ├── End to End Brazilian E-Commerce Analysis.ipynb
│   └── End to End Customer Churn and Sales Analysis.ipynb
├── Flourish
│   └── README.md
├── KNIME Project
│   ├── Knime Simple Data Preprocessing.JPG
│   └── README.md
├── Langchain LLM
│   ├── LangChain_Chroma.ipynb
│   ├── Langchain_Analyze_CSV.ipynb
│   ├── Langchain_Analyze_PDF.ipynb
│   ├── README.md
│   ├── gemini_web_langchain.py
│   ├── langchain_complete
│   │   ├── file.csv
│   │   ├── file.docx
│   │   ├── file.pdf
│   │   ├── file.pptx
│   │   ├── file.xlsx
│   │   └── langchain_streamlit.py
│   └── langchain_youtube.py
├── LlamaIndex
│   └── llamastreamlit.py
├── MySQL
│   ├── Data Science Salary Query.sql
│   ├── INNER JOIN COMBINATION.sql
│   ├── README.md
│   ├── SQL JOIN.sql
│   ├── STUDENTS PERFORMANCE.sql
│   ├── SUPERSTORE DATA ANALYSIS.sql
│   ├── Sample - Superstore - Wanda.xlsx - Orders.csv
│   ├── Students_Performance_mv.csv
│   ├── VIRTUAL INTERNSHIP QUERIES.sql
│   ├── ds_salaries.csv
│   ├── exam score analysis.sql
│   ├── excercise 1.sql
│   ├── sakila-dvd-rental.sql
│   └── yellow_tlc_apr2022_1k.csv
├── Natural Language Processing
│   ├── Anies_Sentiment_Analysis.ipynb
│   ├── RUU_DPR_2020_2024.ipynb
│   ├── Sentiment_Analisis_Prabowo.ipynb
│   └── emotion_streamlit.py
├── Power BI
│   ├── pbi1.JPG
│   └── pbi2.JPG
├── PySpark
│   ├── Insurance_Claim_Pyspark.ipynb
│   └── PySpark_Data_Preprocessing.ipynb
├── R Language
│   ├── calculate.R
│   ├── coba.R
│   └── portfolio.R
├── README.md
├── Regression Prediction Scikit Learn
│   ├── Ford Car Price Prediction.ipynb
│   ├── Honda Price Prediction.ipynb
│   ├── House Price Prediction for Kaggle Competition.ipynb
│   ├── House Rent Price Prediction.ipynb
│   ├── Media Campaign Cost Prediction.ipynb
│   ├── Medical Insurance Cost Prediction.ipynb
│   ├── Melbourne Housing Price Prediction.ipynb
│   ├── NY Rental Pricing Prediction.ipynb
│   ├── Rain Prediction in Australian Coursera.ipynb
│   ├── Salary Prediction.ipynb
│   ├── Salary prediction based on country and race.ipynb
│   ├── Software Industry Salary Prediction.ipynb
│   ├── Sport Car Price Prediction.ipynb
│   ├── USA Real Estate Price Prediction.ipynb
│   └── Used Vehicle Price Prediction.ipynb
├── Snowflake Cloud
│   ├── README.md
│   ├── Snowflake_Python_Connector.ipynb
│   ├── Snowflake_Snowpark_Session.ipynb
│   └── Snowpark_Data_Pipeline_and_Transformation_Covid.ipynb
├── Streamlit-Web-Application-main
│   ├── README.md
│   ├── __pycache__
│   │   ├── flask.cpython-311.pyc
│   │   └── pandasai.cpython-311.pyc
│   ├── auto_sentiment_analysis_twitter.py
│   ├── chat_with_your_csv.py
│   ├── cheatgpt.py
│   ├── compare.py
│   ├── complete_pack.py
│   ├── diagnosis.py
│   ├── ecommerce_clustering_llm.py
│   ├── fraud_analysis_llm.py
│   ├── indonesia-bert-sentiment-classification
│   │   └── config.json
│   ├── llmpandas.py
│   ├── pdf_comparer.py
│   ├── pdf_document_analysis.py
│   ├── table_scraper_analysis.py
│   └── web_scrape.py
├── Tableau
│   └── Dashboard 1.png
├── Tensorflow
│   ├── ANTAM_Stock_Price_Prediction.ipynb
│   ├── Classify_Mineral_Stone.ipynb
│   ├── GOTO_Stock_Price.ipynb
│   ├── Insurance_Claim_Fraud_with_GAN.ipynb
│   └── Insurance_Claim_Tensorflow.ipynb
└── site
    └── en
        └── gemini-api
            └── docs
                └── model-tuning
                    └── python.ipynb
/Apache Airflow/Welcome_DAG.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | from airflow import DAG
4 | from airflow.operators.empty import EmptyOperator
5 |
6 | my_dag = DAG(
7 | dag_id="my_dag_name",
8 | start_date=datetime.datetime(2021, 1, 1),
9 | schedule="@daily",
10 | )
11 | EmptyOperator(task_id="task", dag=my_dag)
--------------------------------------------------------------------------------
/Apache Airflow/dag_preprocessing.py:
--------------------------------------------------------------------------------
1 | from airflow import DAG
2 | from airflow.providers.mysql.hooks.mysql import MySqlHook
3 | from datetime import datetime
4 | from airflow.operators.python import PythonOperator
5 |
6 | # Replace with your actual connection ID
7 | connection_id = 'mysql'
8 |
9 | def test_mysql_connection():
10 | try:
11 | # Get the connection from Airflow
12 | mysql_hook = MySqlHook(mysql_conn_id=connection_id)
13 |
14 | # Attempt a simple connection test (e.g., ping the server)
15 | with mysql_hook.get_conn() as conn:
16 | cursor = conn.cursor()
17 | cursor.execute("SELECT * FROM marketing.customer;")
18 | result = cursor.fetchone()
19 |
20 | if result:
21 | print("Connection to MySQL successful!")
22 | else:
23 | print("Connection test failed!")
24 |
25 | except Exception as e:
26 | print(f"Error connecting to MySQL: {e}")
27 |
28 | with DAG(dag_id='test_mysql_connection',
29 | start_date=datetime(2024, 4, 15),
30 | schedule_interval=None) as dag:
31 |
32 | test_connection_task = PythonOperator(
33 | task_id='test_connection',
34 | python_callable=test_mysql_connection
35 | )
36 |
--------------------------------------------------------------------------------
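A possible next step once the connection test passes is to pull the queried table into pandas for preprocessing. Below is a minimal sketch under stated assumptions: the `customer_preprocessing_sketch` DAG id, the `preprocess_customers` task, and the de-duplication step are illustrative, not taken from dag_preprocessing.py; `MySqlHook.get_pandas_df` comes from the same MySQL provider used above.

```python
# Sketch only: a follow-up DAG that pulls the same table into pandas for cleaning.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.mysql.hooks.mysql import MySqlHook


def preprocess_customers():
    hook = MySqlHook(mysql_conn_id='mysql')           # same connection ID as above
    df = hook.get_pandas_df("SELECT * FROM marketing.customer;")
    df = df.drop_duplicates()                         # illustrative cleaning step
    print(f"Rows after de-duplication: {len(df)}")


with DAG(dag_id='customer_preprocessing_sketch',
         start_date=datetime(2024, 4, 15),
         schedule_interval=None) as dag:

    PythonOperator(
        task_id='preprocess_customers',
        python_callable=preprocess_customers,
    )
```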
/Apache Airflow/population_dag.py:
--------------------------------------------------------------------------------
1 | from airflow import DAG
2 | from airflow.utils.dates import days_ago
3 | from airflow.operators.python import PythonOperator  # non-deprecated import path in Airflow 2.x
4 | from bs4 import BeautifulSoup # For web scraping
5 | import requests
6 |
7 | # Define default arguments
8 | default_args = {
9 | 'owner': 'airflow',
10 | 'start_date': days_ago(1), # Start yesterday
11 | # Note: the schedule is set on the DAG itself below, not in default_args
12 | }
13 |
14 |
15 | def scrape_worldometer(ti): # Inject the TaskInstance object
16 | """
17 | Scrapes Worldometer website for population data and stores in XCom.
18 | """
19 | url = 'https://www.worldometers.info/world-population/'
20 | response = requests.get(url)
21 | soup = BeautifulSoup(response.content, 'html.parser')
22 |
23 | # Target elements using updated selectors
24 | births_today = soup.find('span', class_='rts-counter', rel='births_today').text.strip()
25 | deaths_today = soup.find('span', class_='rts-counter', rel='dth1s_today').text.strip()
26 |
27 | # Store data in XCom for retrieval by downstream tasks
28 | ti.xcom_push(
29 | key='worldometer_data',
30 | value={
31 | 'births_today': births_today,
32 | 'deaths_today': deaths_today
33 | }
34 | )
35 |
36 | # Define the DAG
37 | with DAG(
38 | dag_id='worldometer_scraper',
39 | default_args=default_args, schedule_interval='@daily',  # Run daily
40 | ) as dag:
41 |
42 | # Scrape data task
43 | scrape_task = PythonOperator(
44 | task_id='scrape_worldometer',
45 | python_callable=scrape_worldometer, # Pass the function with TaskInstance injection
46 | )
47 |
--------------------------------------------------------------------------------
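The scrape task stores its result in XCom "for retrieval by downstream tasks", but the DAG defines no consumer. A minimal sketch of one, meant to live inside the same `with DAG(...)` block (the `print_population_stats` task id is an assumption):

```python
# Hypothetical downstream task: pull the XCom pushed by scrape_worldometer.
# PythonOperator is already imported at the top of population_dag.py.
def print_population_stats(ti):
    data = ti.xcom_pull(task_ids='scrape_worldometer', key='worldometer_data')
    print(f"Births today: {data['births_today']}, deaths today: {data['deaths_today']}")

# Inside the same `with DAG(...)` block:
print_task = PythonOperator(
    task_id='print_population_stats',
    python_callable=print_population_stats,
)
scrape_task >> print_task  # run the consumer after the scraper
```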
/Apache Airflow/wiki_dag.py:
--------------------------------------------------------------------------------
1 | from airflow import DAG
2 | from airflow.utils.dates import days_ago
3 | from airflow.operators.python import PythonOperator  # non-deprecated import path in Airflow 2.x
4 | from bs4 import BeautifulSoup
5 | import requests
6 |
7 | # Define default arguments
8 | default_args = {
9 | 'owner': 'airflow',
10 | 'start_date': days_ago(1), # Start yesterday
11 | # Note: the schedule is set on the DAG itself below, not in default_args
12 | }
13 |
14 |
15 | def scrape_wiki_content(ti):
16 | """
17 | Scrapes content from Albert Einstein's Wikipedia page and stores it in XCom.
18 | """
19 | url = 'https://en.wikipedia.org/wiki/Albert_Einstein'
20 | response = requests.get(url)
21 | soup = BeautifulSoup(response.content, 'html.parser')
22 |
23 | # Target all paragraphs within the main content section (can be adjusted)
24 | content_elements = soup.find_all('p', class_=None) # Find all paragraphs without a class
25 |
26 | # Combine the text content of all paragraphs
27 | content_text = '\n'.join([p.get_text(strip=True) for p in content_elements])
28 |
29 | # Store the content in XCom for retrieval by downstream tasks
30 | ti.xcom_push(
31 | key='einstein_wiki_content',
32 | value=content_text
33 | )
34 |
35 |
36 | # Define the DAG
37 | with DAG(
38 | dag_id='wiki_einstein_scraper',
39 | default_args=default_args, schedule_interval='@daily',  # Run daily
40 | ) as dag:
41 |
42 | # Scrape data task
43 | scrape_task = PythonOperator(
44 | task_id='scrape_wiki_content',
45 | python_callable=scrape_wiki_content, # Pass the function with TaskInstance injection
46 | )
47 |
--------------------------------------------------------------------------------
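As with population_dag.py, the pushed XCom has no downstream consumer here. A short sketch of one, again intended for the same `with DAG(...)` block (the `report_wiki_content` task id is an assumption):

```python
# Hypothetical downstream task: consume the XCom pushed by scrape_wiki_content.
def report_wiki_content(ti):
    text = ti.xcom_pull(task_ids='scrape_wiki_content', key='einstein_wiki_content')
    print(f"Scraped {len(text.split())} words; preview: {text[:200]}")

# Inside the same `with DAG(...)` block:
report_task = PythonOperator(
    task_id='report_wiki_content',
    python_callable=report_wiki_content,
)
scrape_task >> report_task
```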
/Flourish/README.md:
--------------------------------------------------------------------------------
1 | # Links to Flourish Visualizations
2 |
3 | 1. Loan Default Analysis : https://public.flourish.studio/story/2119154/
4 | 2. Superstore Sales Analysis : https://public.flourish.studio/story/2117963/
5 |
--------------------------------------------------------------------------------
/KNIME Project/Knime Simple Data Preprocessing.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/KNIME Project/Knime Simple Data Preprocessing.JPG
--------------------------------------------------------------------------------
/KNIME Project/README.md:
--------------------------------------------------------------------------------
1 | # KNIME Project
2 |
3 | 1. Simple Data Preprocessing explanation: https://www.linkedin.com/posts/michael-wiryaseputra_knime-datapreprocessing-datascience-activity-7184053754592129024-eqzs?utm_source=share&utm_medium=member_desktop
4 |
--------------------------------------------------------------------------------
/Langchain LLM/README.md:
--------------------------------------------------------------------------------
1 | # Langchain LLM
2 | This repository is for all of my LangChain projects. Use the code for reference only, and adapt it before using it in a real-time project.
3 |
4 | ## 1. Langchain Analyze CSV
5 | This project analyzes a CSV file with the LangChain CSV Agent: the user asks anything about the CSV dataset, and LangChain queries it and answers the question (a minimal agent sketch follows this README).
6 |
7 | ## 2. Langchain Analyze PDF
8 | This project analyzes the content of a PDF file with LangChain and answers the user's questions about it.
9 |
10 | ## 3. Langchain Analyze Youtube Video
11 | This project analyzes the content of a YouTube video with LangChain and answers the user's questions about it.
12 |
13 | ## 4. Langchain Analyze Website
14 | This project analyzes the content of a website with LangChain and answers the user's questions about it.
15 |
--------------------------------------------------------------------------------
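Section 1 of this README describes the CSV Agent workflow, but the corresponding notebook's code is not reproduced in this listing. For orientation, here is a minimal, hypothetical sketch of that pattern; the file name, model name, environment variable, and `allow_dangerous_code` flag are assumptions about a recent `langchain-experimental` release, not code from Langchain_Analyze_CSV.ipynb:

```python
# Sketch of the CSV Agent described above (not the notebook's actual code).
import os

from langchain_experimental.agents import create_csv_agent
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-latest",
    google_api_key=os.environ["GOOGLE_API_KEY"],   # assumed environment variable
)

# The agent writes and executes pandas code over the CSV, hence the explicit opt-in flag.
agent = create_csv_agent(llm, "file.csv", verbose=True, allow_dangerous_code=True)
print(agent.invoke("How many rows are there, and which column has the most missing values?")["output"])
```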
/Langchain LLM/gemini_web_langchain.py:
--------------------------------------------------------------------------------
1 | from langchain_google_genai import ChatGoogleGenerativeAI
2 | from langchain_google_genai import GoogleGenerativeAIEmbeddings
3 | from langchain_community.document_loaders import WebBaseLoader
4 | from langchain.chains import StuffDocumentsChain
5 | from langchain.chains.llm import LLMChain
6 | from langchain.prompts import PromptTemplate
7 | import google.generativeai as genai
8 |
9 | #genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc")
10 |
11 | #Initialize Model
12 | llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc")
13 |
14 | #Load the blog
15 | loader = WebBaseLoader("https://thenewstack.io/the-building-blocks-of-llms-vectors-tokens-and-embeddings/")
16 | docs = loader.load()
17 |
18 | #Define the Summarize Chain
19 | template = """Write a concise summary of the following:
20 | "{text}"
21 | CONCISE SUMMARY:"""
22 |
23 | prompt = PromptTemplate.from_template(template)
24 |
25 | llm_chain = LLMChain(llm=llm, prompt=prompt)
26 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
27 |
28 | #Invoke Chain
29 | response=stuff_chain.invoke(docs)
30 | print(response["output_text"])
--------------------------------------------------------------------------------
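The script above wires `LLMChain` and `StuffDocumentsChain` together by hand. In recent LangChain releases the same "stuff" summarization can be assembled with the `load_summarize_chain` helper; a roughly equivalent sketch (same page and model, with the API key assumed to come from an environment variable rather than being hard-coded):

```python
# Equivalent sketch using the summarize-chain helper instead of wiring the chains manually.
import os

from langchain.chains.summarize import load_summarize_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-pro",
                             google_api_key=os.environ["GOOGLE_API_KEY"])  # assumed env var

docs = WebBaseLoader(
    "https://thenewstack.io/the-building-blocks-of-llms-vectors-tokens-and-embeddings/"
).load()

chain = load_summarize_chain(llm, chain_type="stuff")  # "stuff" = pass all docs in one prompt
print(chain.invoke(docs)["output_text"])
```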
/Langchain LLM/langchain_complete/file.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Langchain LLM/langchain_complete/file.docx
--------------------------------------------------------------------------------
/Langchain LLM/langchain_complete/file.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Langchain LLM/langchain_complete/file.pdf
--------------------------------------------------------------------------------
/Langchain LLM/langchain_complete/file.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Langchain LLM/langchain_complete/file.pptx
--------------------------------------------------------------------------------
/Langchain LLM/langchain_complete/file.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Langchain LLM/langchain_complete/file.xlsx
--------------------------------------------------------------------------------
/Langchain LLM/langchain_complete/langchain_streamlit.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | # Define functions for each page
4 | def langchain_pdf():
5 | st.title("Langchain PDF Text Analysis")
6 | from langchain_google_genai import ChatGoogleGenerativeAI
7 | from langchain_community.document_loaders import PyPDFLoader
8 | from langchain.chains import StuffDocumentsChain
9 | from langchain.chains.llm import LLMChain
10 | from langchain.prompts import PromptTemplate
11 | import asyncio
12 | import nest_asyncio
13 | nest_asyncio.apply()
14 |
15 | # Initialize Model
16 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM")
17 |
18 | # Input for PDF file
19 | uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
20 |
21 | # Input for the question
22 | question = st.text_input("Enter your question:")
23 |
24 | if st.button("Analyze"):
25 | if uploaded_file is not None:
26 | # Save the uploaded PDF file with the name "file.pdf"
27 | with open("file.pdf", "wb") as f:
28 | f.write(uploaded_file.getvalue())
29 |
30 | # Load the PDF file
31 | loader = PyPDFLoader("file.pdf")
32 | docs = loader.load_and_split()
33 |
34 | # Define the Summarize Chain
35 | template = question + """ Write a concise summary of the following:
36 | "{text}"
37 | CONCISE SUMMARY:"""
38 |
39 | prompt = PromptTemplate.from_template(template)
40 |
41 | llm_chain = LLMChain(llm=llm, prompt=prompt)
42 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
43 |
44 | # Invoke Chain
45 | response = stuff_chain.invoke(docs)
46 | summary = response["output_text"]
47 |
48 | # Display the summary
49 | st.header("Summary:")
50 | st.write(summary)
51 | else:
52 | st.error("Please upload a PDF file.")
53 |
54 |
55 | def langchain_doc():
56 | st.title("Langchain Microsoft Word File Analysis")
57 | from langchain_google_genai import ChatGoogleGenerativeAI
58 | from langchain_community.document_loaders import Docx2txtLoader
59 | from langchain.chains import StuffDocumentsChain
60 | from langchain.chains.llm import LLMChain
61 | from langchain.prompts import PromptTemplate
62 | import asyncio
63 | import nest_asyncio
64 | nest_asyncio.apply()
65 |
66 | # Initialize Model
67 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM")
68 |
69 | # Input for Word document
70 | uploaded_file = st.file_uploader("Upload Word document", type=["docx"])
71 |
72 | # Input for the question
73 | question = st.text_input("Enter your question:")
74 |
75 | if st.button("Analyze"):
76 | if uploaded_file is not None:
77 | # Save the uploaded Word document with the name "file.docx"
78 | with open("file.docx", "wb") as f:
79 | f.write(uploaded_file.getvalue())
80 |
81 | # Load the Word document
82 | loader = Docx2txtLoader("file.docx")
83 | docs = loader.load_and_split()
84 |
85 | # Define the Summarize Chain
86 | template = question + """ Write a concise summary of the following:
87 | "{text}"
88 | CONCISE SUMMARY:"""
89 |
90 | prompt = PromptTemplate.from_template(template)
91 |
92 | llm_chain = LLMChain(llm=llm, prompt=prompt)
93 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
94 |
95 | # Invoke Chain
96 | response = stuff_chain.invoke(docs)
97 | summary = response["output_text"]
98 |
99 | # Display the summary
100 | st.header("Summary:")
101 | st.write(summary)
102 | else:
103 | st.error("Please upload a Microsoft Word file.")
104 |
105 |
106 | def langchain_excel():
107 | st.title("Langchain Microsoft Excel File Analysis")
108 | from langchain_google_genai import ChatGoogleGenerativeAI
109 | from langchain_community.document_loaders import UnstructuredExcelLoader
110 | from langchain.chains import StuffDocumentsChain
111 | from langchain.chains.llm import LLMChain
112 | from langchain.prompts import PromptTemplate
113 | import asyncio
114 | import nest_asyncio
115 | nest_asyncio.apply()
116 |
117 | # Initialize Model
118 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM")
119 |
120 | # Input for Excel file
121 | uploaded_file = st.file_uploader("Upload Excel workbook", type=["xlsx"])
122 |
123 | # Input for the question
124 | question = st.text_input("Enter your question:")
125 |
126 | if st.button("Analyze"):
127 | if uploaded_file is not None:
128 | # Save the uploaded Excel file with the name "file.xlsx"
129 | with open("file.xlsx", "wb") as f:
130 | f.write(uploaded_file.getvalue())
131 |
132 | # Load the Excel file
133 | loader = UnstructuredExcelLoader("file.xlsx", mode="elements")
134 | docs = loader.load()
135 |
136 | # Define the Summarize Chain
137 | template = question + """ Write a concise summary of the following:
138 | "{text}"
139 | CONCISE SUMMARY:"""
140 |
141 | prompt = PromptTemplate.from_template(template)
142 |
143 | llm_chain = LLMChain(llm=llm, prompt=prompt)
144 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
145 |
146 | # Invoke Chain
147 | response = stuff_chain.invoke(docs)
148 | summary = response["output_text"]
149 |
150 | # Display the summary
151 | st.header("Summary:")
152 | st.write(summary)
153 | else:
154 | st.error("Please upload an Excel file.")
155 |
156 | def langchain_ppt():
157 | st.title("Langchain Microsoft Power Point File Analysis")
158 | from langchain_google_genai import ChatGoogleGenerativeAI
159 | from langchain_community.document_loaders import UnstructuredPowerPointLoader
160 | from langchain.chains import StuffDocumentsChain
161 | from langchain.chains.llm import LLMChain
162 | from langchain.prompts import PromptTemplate
163 | import asyncio
164 | import nest_asyncio
165 | nest_asyncio.apply()
166 |
167 | # Initialize Model
168 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM")
169 |
170 | # Input for PowerPoint file
171 | uploaded_file = st.file_uploader("Upload PowerPoint presentation", type=["pptx"])
172 |
173 | # Input for the question
174 | question = st.text_input("Enter your question:")
175 |
176 | if st.button("Analyze"):
177 | if uploaded_file is not None:
178 | # Save the uploaded PowerPoint file with the name "file.pptx"
179 | with open("file.pptx", "wb") as f:
180 | f.write(uploaded_file.getvalue())
181 |
182 | # Load the PowerPoint file
183 | loader = UnstructuredPowerPointLoader("file.pptx", mode="elements")
184 | docs = loader.load_and_split()
185 |
186 | # Define the Summarize Chain
187 | template = question + """ Write a concise summary of the following:
188 | "{text}"
189 | CONCISE SUMMARY:"""
190 |
191 | prompt = PromptTemplate.from_template(template)
192 |
193 | llm_chain = LLMChain(llm=llm, prompt=prompt)
194 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
195 |
196 | # Invoke Chain
197 | response = stuff_chain.invoke(docs)
198 | summary = response["output_text"]
199 |
200 | # Display the summary
201 | st.header("Summary:")
202 | st.write(summary)
203 | else:
204 | st.error("Please upload a PowerPoint file.")
205 |
206 | def langchain_csv():
207 | st.title("Langchain CSV File Analysis")
208 | from langchain_google_genai import ChatGoogleGenerativeAI
209 | from langchain_community.document_loaders.csv_loader import CSVLoader
210 | from langchain.chains import StuffDocumentsChain
211 | from langchain.chains.llm import LLMChain
212 | from langchain.prompts import PromptTemplate
213 | import asyncio
214 | import nest_asyncio
215 | nest_asyncio.apply()
216 |
217 | # Initialize Model
218 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM")
219 |
220 | # Input for CSV file
221 | uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
222 |
223 | # Input for the question
224 | question = st.text_input("Enter your question:")
225 |
226 | if st.button("Analyze"):
227 | if uploaded_file is not None:
228 | # Save the uploaded CSV file with the name "file.csv"
229 | with open("file.csv", "wb") as f:
230 | f.write(uploaded_file.getvalue())
231 |
232 | # Load the CSV file
233 | loader = CSVLoader(file_path="file.csv")
234 | docs = loader.load()
235 |
236 | # Define the Summarize Chain
237 | template = question + """ Write a concise summary of the following:
238 | "{text}"
239 | CONCISE SUMMARY:"""
240 |
241 | prompt = PromptTemplate.from_template(template)
242 |
243 | llm_chain = LLMChain(llm=llm, prompt=prompt)
244 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
245 |
246 | # Invoke Chain
247 | response = stuff_chain.invoke(docs)
248 | summary = response["output_text"]
249 |
250 | # Display the summary
251 | st.header("Summary:")
252 | st.write(summary)
253 | else:
254 | st.error("Please upload a CSV file.")
255 |
256 | def langchain_web():
257 | st.title("Langchain Web Content Analysis")
258 | from langchain_google_genai import ChatGoogleGenerativeAI
259 | from langchain_community.document_loaders import WebBaseLoader
260 | from langchain.chains import StuffDocumentsChain
261 | from langchain.chains.llm import LLMChain
262 | from langchain.prompts import PromptTemplate
263 | import asyncio
264 | import nest_asyncio
265 | nest_asyncio.apply()
266 |
267 | # Initialize Model
268 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM")
269 |
270 | # Input for article link
271 | article_link = st.text_input("Enter the link to the article:")
272 |
273 | # Input for the question
274 | question = st.text_input("Enter your question:")
275 |
276 | if st.button("Analyze"):
277 | if article_link.strip() == "":
278 | st.error("Please enter a link to the article.")
279 | else:
280 | # Load the article content
281 | loader = WebBaseLoader(article_link)
282 | docs = loader.load()
283 |
284 | # Define the Summarize Chain
285 | template = question + """ Write a concise summary of the following:
286 | "{text}"
287 | CONCISE SUMMARY:"""
288 |
289 | prompt = PromptTemplate.from_template(template)
290 |
291 | llm_chain = LLMChain(llm=llm, prompt=prompt)
292 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
293 |
294 | # Invoke Chain
295 | response = stuff_chain.invoke(docs)
296 | summary = response["output_text"]
297 |
298 | # Display the summary
299 | st.header("Summary:")
300 | st.write(summary)
301 |
302 | def langchain_youtube():
303 | st.title("Langchain Youtube Video Analysis")
304 | from langchain_google_genai import ChatGoogleGenerativeAI
305 | from langchain_community.document_loaders import YoutubeLoader
306 | from langchain.chains import StuffDocumentsChain
307 | from langchain.chains.llm import LLMChain
308 | from langchain.prompts import PromptTemplate
309 | import asyncio
310 | import nest_asyncio
311 | nest_asyncio.apply()
312 |
313 | # Initialize Model
314 | llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="AIzaSyAQLXJ6ROBzMycImPVp2jTlbB3zIpEWmhM")
315 |
316 | # Input for article link
317 | youtube_link = st.text_input("Enter the YouTube link:")
318 |
319 | # Input for the question
320 | question = st.text_input("Enter your question:")
321 |
322 | if st.button("Analyze"):
323 | if youtube_link.strip() == "":
324 | st.error("Please enter a YouTube link.")
325 | else:
326 | # Load the video transcript
327 | loader = YoutubeLoader.from_youtube_url(
328 | youtube_link,
329 | add_video_info=True,
330 | language=["en", "id"],
331 | translation="en",
332 | )
333 | docs = loader.load()
334 |
335 | # Define the Summarize Chain
336 | template = question + """ Write a concise summary of the following:
337 | "{text}"
338 | CONCISE SUMMARY:"""
339 |
340 | prompt = PromptTemplate.from_template(template)
341 |
342 | llm_chain = LLMChain(llm=llm, prompt=prompt)
343 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
344 |
345 | # Invoke Chain
346 | response = stuff_chain.invoke(docs)
347 | summary = response["output_text"]
348 |
349 | # Display the summary
350 | st.header("Summary:")
351 | st.write(summary)
352 |
353 | # Set CSS to arrange buttons horizontally
354 | st.markdown(
355 | """
356 |
363 | """,
364 | unsafe_allow_html=True,
365 | )
366 |
367 | # Get the selected page
368 | selected_page = st.sidebar.radio(
369 | "Select Page",
370 | ("Langchain PDF Text Analysis",
371 | "Langchain Microsoft Word File Analysis",
372 | "Langchain Microsoft Excel File Analysis",
373 | "Langchain Microsoft Power Point File Analysis",
374 | "Langchain CSV File Analysis",
375 | "Langchain Web Content Analysis",
376 | "Langchain Youtube Video Analysis")
377 | )
378 |
379 | if selected_page == "Langchain PDF Text Analysis":
380 | langchain_pdf()
381 | elif selected_page == "Langchain Microsoft Word File Analysis":
382 | langchain_doc()
383 | elif selected_page == "Langchain Microsoft Excel File Analysis":
384 | langchain_excel()
385 | elif selected_page == "Langchain Microsoft Power Point File Analysis":
386 | langchain_ppt()
387 | elif selected_page == "Langchain CSV File Analysis":
388 | langchain_csv()
389 | elif selected_page == "Langchain Web Content Analysis":
390 | langchain_web()
391 | elif selected_page == "Langchain Youtube Video Analysis":
392 | langchain_youtube()
393 |
--------------------------------------------------------------------------------
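Every page function above hard-codes the Gemini API key. A small sketch of the usual alternative, resolving the key once from the environment (the `GOOGLE_API_KEY` name is an assumption) and reusing it in each `ChatGoogleGenerativeAI` call:

```python
# Sketch: resolve the Gemini API key once instead of hard-coding it in every page function.
import os

import streamlit as st


def get_google_api_key() -> str:
    key = os.environ.get("GOOGLE_API_KEY", "")
    if not key:
        st.error("Set the GOOGLE_API_KEY environment variable before running the app.")
        st.stop()
    return key

# Each page function could then build its model as:
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key=get_google_api_key())
```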
/Langchain LLM/langchain_youtube.py:
--------------------------------------------------------------------------------
1 | from langchain_google_genai import ChatGoogleGenerativeAI
2 | from langchain_google_genai import GoogleGenerativeAIEmbeddings
3 | from langchain_community.document_loaders import YoutubeLoader
4 | from langchain.chains import StuffDocumentsChain
5 | from langchain.chains.llm import LLMChain
6 | from langchain.prompts import PromptTemplate
7 | import google.generativeai as genai
8 |
9 | #genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc")
10 |
11 | #Initialize Model
12 | llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc")
13 |
14 | # Load the YouTube video transcript
15 | loader = YoutubeLoader.from_youtube_url(
16 | "https://www.youtube.com/watch?v=bT8_sZlgOSI",
17 | add_video_info=True,
18 | language=["en", "id"],
19 | translation="en",
20 | )
21 | docs = loader.load()
22 |
23 | #Define the Summarize Chain
24 | template = """Write a concise summary of the following:
25 | "{text}"
26 | CONCISE SUMMARY:"""
27 |
28 | prompt = PromptTemplate.from_template(template)
29 |
30 | llm_chain = LLMChain(llm=llm, prompt=prompt)
31 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
32 |
33 | #Invoke Chain
34 | response=stuff_chain.invoke(docs)
35 | print(response["output_text"])
--------------------------------------------------------------------------------
/LlamaIndex/llamastreamlit.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from bs4 import BeautifulSoup
3 | from llama_index.core import Document, Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex
4 | from llama_index.readers.web import SimpleWebPageReader
5 | from llama_index.vector_stores.chroma import ChromaVectorStore
6 | from llama_index.embeddings.gemini import GeminiEmbedding
7 | from llama_index.llms.gemini import Gemini
8 | from llama_index.core import PromptTemplate
9 | import chromadb
10 |
11 | # Set up Streamlit page title and instructions
12 | st.title("LlamaIndex + Google Gemini Web Article Question Answering")
13 | st.write("Please input the URL of the webpage you'd like to analyze, and ask your question about it.")
14 |
15 | # Input for the webpage URL
16 | url = st.text_input("Enter URL:")
17 |
18 | # Input for the question
19 | question = st.text_input("Ask your question:")
20 |
21 | # If both URL and question are provided, execute the code
22 | if url and question:
23 | # Load webpage content
24 | web_documents = SimpleWebPageReader().load_data([url])
25 | html_content = web_documents[0].text
26 |
27 | # Parse HTML content
28 | soup = BeautifulSoup(html_content, 'html.parser')
29 | p_tags = soup.findAll('p')
30 | text_content = ""
31 | for each in p_tags:
32 | text_content += each.text + "\n"
33 |
34 | # Convert to Document format
35 | documents = [Document(text=text_content)]
36 |
37 | # Initialize Gemini embedding model and LLAMA model
38 | gemini_api_key = "AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA"
39 | gemini_embedding_model = GeminiEmbedding(api_key=gemini_api_key, model_name="models/embedding-001")
40 | llm = Gemini(api_key=gemini_api_key, model_name="models/gemini-pro")
41 |
42 | # Create a client and a new collection
43 | client = chromadb.PersistentClient(path="./chroma_db")
44 | chroma_collection = client.get_or_create_collection("quickstart")
45 |
46 | # Create a vector store
47 | vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
48 |
49 | # Create a storage context
50 | storage_context = StorageContext.from_defaults(vector_store=vector_store)
51 |
52 | # Set Global settings
53 | Settings.llm = llm
54 | Settings.embed_model = gemini_embedding_model
55 |
56 | # Create an index from the documents
57 | index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
58 |
59 | # Define LLAMA prompt template
60 | template = (
61 | """ You are an assistant for question-answering tasks.
62 | Use the following context to answer the question.
63 | If you don't know the answer, just say that you don't know.
64 | Use five sentences maximum and keep the answer concise.\n
65 | Question: {query_str} \nContext: {context_str} \nAnswer:"""
66 | )
67 | llm_prompt = PromptTemplate(template)
68 |
69 | # Query data from the persisted index
70 | query_engine = index.as_query_engine(text_qa_template=llm_prompt)
71 | response = query_engine.query(question)
72 |
73 | # Display just the response text
74 | st.write("Answer:", response.response)
75 |
76 |
--------------------------------------------------------------------------------
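Because the Chroma collection is persisted to ./chroma_db, a later run does not have to re-embed the same page. A minimal sketch of reopening the stored collection as an index (assuming the same collection name and that `Settings.llm` / `Settings.embed_model` are configured as in the app above):

```python
# Sketch: reopen the persisted "quickstart" collection instead of re-indexing the page.
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

client = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = client.get_or_create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Assumes Settings.llm and Settings.embed_model are configured as in llamastreamlit.py.
index = VectorStoreIndex.from_vector_store(vector_store)
query_engine = index.as_query_engine()
print(query_engine.query("What is the stored article about?"))
```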
/MySQL/Data Science Salary Query.sql:
--------------------------------------------------------------------------------
1 | /* SELECT DATA WHERE SALARY > 100000 */
2 | SELECT * FROM new_schema.ds_salaries
3 | WHERE salary > 100000;
4 |
5 | /* SELECT DATA WHERE SALARY > 100000, Company location in US, Order the salary from the largest */
6 | SELECT MyUnknownColumn, job_title, salary_in_usd, company_location FROM new_schema.ds_salaries
7 | WHERE salary_in_usd > 100000
8 | AND company_location = 'US'
9 | ORDER BY salary_in_usd DESC;
10 |
11 | /* Count the average Average Salary in USD group by job title and sort from the largest */
12 | SELECT AVG(salary_in_usd) AS AVERAGE_SALARY_IN_USD, job_title FROM new_schema.ds_salaries
13 | GROUP BY job_title
14 | ORDER BY AVERAGE_SALARY_IN_USD DESC;
--------------------------------------------------------------------------------
/MySQL/INNER JOIN COMBINATION.sql:
--------------------------------------------------------------------------------
1 | SELECT DISTINCT Z.customer_id, X.email, CONCAT (X.first_name,' ', X.last_name) AS full_name , Z.inventory_id
2 | FROM sakila.rental Z
3 | LEFT JOIN sakila.customer X
4 | ON Z.customer_id = X.customer_id
5 | ORDER BY inventory_id ASC;
6 |
7 | SELECT A.city_id, A.city, B.country
8 | FROM sakila.city A
9 | INNER JOIN sakila.country B
10 | ON A.city_id = B.country_id;
11 |
12 | SELECT A.film_id, A.actor_id, B.category_id
13 | FROM sakila.film_actor A
14 | RIGHT JOIN sakila.film_category B
15 | ON A.film_id = B.category_id;
16 |
--------------------------------------------------------------------------------
/MySQL/README.md:
--------------------------------------------------------------------------------
1 | # SQL Portfolio
2 | Here are all of my SQL portfolio projects, created using MySQL Workbench.
3 |
--------------------------------------------------------------------------------
/MySQL/SQL JOIN.sql:
--------------------------------------------------------------------------------
1 | SELECT C.city_id, C.city_name, S.country_name
2 | FROM new_schema.`city` C
3 | JOIN new_schema.`country` S
4 | ON C.city_id = S.country_id;
5 |
6 | SELECT C.city_id, C.city_name, S.country_name
7 | FROM new_schema.`city` C
8 | LEFT JOIN new_schema.`country` S
9 | ON C.city_id = S.country_id;
10 |
11 | SELECT C.city_id, C.city_name, S.country_name
12 | FROM new_schema.`city` C
13 | RIGHT JOIN new_schema.`country` S
14 | ON C.city_id = S.country_id;
--------------------------------------------------------------------------------
/MySQL/STUDENTS PERFORMANCE.sql:
--------------------------------------------------------------------------------
1 | SELECT * FROM new_schema.students_performance_mv;
2 |
3 | /* COUNT RACE ETHNICITY WHERE test preparation course is completed and ORDER BY ASCENDING */
4 | SELECT race_ethnicity, COUNT(race_ethnicity) AS TOTAL FROM new_schema.students_performance_mv
5 | WHERE test_preparation_course = 'completed'
6 | GROUP BY race_ethnicity
7 | ORDER BY TOTAL;
8 |
9 | /* COUNT THE TOTAL SCORE EACH STUDENT AND THEN RANK THEM FROM HIGHEST*/
10 | SELECT gender, race_ethnicity, test_preparation_course, math_score + reading_score + writing_score AS TOTAL_SCORE
11 | FROM new_schema.students_performance_mv
12 | ORDER BY TOTAL_SCORE DESC;
13 |
14 | /* COUNT THE AVERAGE SCORE OF 3 TEST THEN COUNT THE AVERAGE AGAIN GROUP BY RACE ETHNICITY THEN ELIMINATE NULL VALUE AND test preparation course is completed */
15 | SELECT race_ethnicity, (AVG(math_score + reading_score + writing_score)/3) AS NILAI_3_PELAJARAN_RATA_RATA
16 | FROM new_schema.students_performance_mv
17 | WHERE test_preparation_course = 'completed'
18 | AND NOT race_ethnicity =''
19 | GROUP BY race_ethnicity
20 | ORDER BY NILAI_3_PELAJARAN_RATA_RATA DESC
21 |
--------------------------------------------------------------------------------
/MySQL/SUPERSTORE DATA ANALYSIS.sql:
--------------------------------------------------------------------------------
1 | SELECT * FROM new_schema.`sample - superstore - wanda.xlsx - orders`;
2 |
3 | /* SELECT AMOUNT OF CUSTOMER EACH REGION */
4 | SELECT Region, COUNT(Region) AS TOTAL_CUSTOMER FROM new_schema.`sample - superstore - wanda.xlsx - orders`
5 | GROUP BY Region
6 | ORDER BY TOTAL_CUSTOMER DESC;
7 |
8 | /* COUNT THE QUANTITY EACH REGION */
9 | SELECT Region, SUM(Quantity) AS TOTAL_QUANTITY FROM new_schema.`sample - superstore - wanda.xlsx - orders`
10 | GROUP BY Region
11 | ORDER BY TOTAL_QUANTITY DESC;
12 |
13 | /* COUNT SALES EACH REGION */
14 | SELECT Region, ROUND(SUM(Sales),2) AS TOTAL_SALES FROM new_schema.`sample - superstore - wanda.xlsx - orders`
15 | GROUP BY Region
16 | ORDER BY TOTAL_SALES DESC;
17 |
18 | /* FIRST BUY EACH REGION */
19 | SELECT Region, MIN(Order_Date) AS FIRST_BUYER_DATE FROM new_schema.`sample - superstore - wanda.xlsx - orders`
20 | GROUP BY Region
21 | ORDER BY FIRST_BUYER_DATE;
--------------------------------------------------------------------------------
/MySQL/VIRTUAL INTERNSHIP QUERIES.sql:
--------------------------------------------------------------------------------
1 | SELECT DISTINCT Z.SK_ID_CURR, X.SK_ID_CURR, Z.CODE_GENDER, X.NAME_CONTRACT_STATUS
2 | FROM vix.hci_application AS Z
3 | INNER JOIN vix.hci_previous AS X
4 | ON Z.SK_ID_CURR = X.SK_ID_CURR
5 | WHERE Z.TARGET = 0
6 | AND Z.CODE_GENDER = 'F'
7 | AND NOT Z.CNT_CHILDREN = 0
8 | AND X.NAME_CONTRACT_STATUS = 'Approved'
9 | ORDER BY X.SK_ID_CURR;
10 |
11 | SELECT DISTINCT Z.SK_ID_CURR, X.SK_ID_CURR, Z.CODE_GENDER, Z.CNT_CHILDREN, Z.TARGET, X.NAME_CONTRACT_TYPE, COUNT(*) AS TOTAL
12 | FROM vix.hci_application AS Z
13 | INNER JOIN vix.hci_previous AS X
14 | ON Z.SK_ID_CURR = X.SK_ID_CURR
15 | GROUP BY X.NAME_CONTRACT_TYPE
16 | /*HAVING Z.CODE_GENDER = 'F'
17 | AND Z.TARGET = 0
18 | AND NOT Z.CNT_CHILDREN = 0*/
19 | ORDER BY TOTAL DESC;
20 |
21 | SELECT DISTINCT Z.SK_ID_CURR, X.SK_ID_CURR, Z.CODE_GENDER, X.NAME_CONTRACT_STATUS
22 | FROM vix.hci_application AS Z
23 | INNER JOIN vix.hci_previous AS X
24 | ON Z.SK_ID_CURR = X.SK_ID_CURR
25 | WHERE Z.TARGET = 0
26 | AND Z.CODE_GENDER = 'F'
27 | AND NOT Z.CNT_CHILDREN = 0
28 | AND X.NAME_CONTRACT_STATUS = 'Approved';
29 |
--------------------------------------------------------------------------------
/MySQL/exam score analysis.sql:
--------------------------------------------------------------------------------
1 | SELECT gender, race_ethnicity, math_score, reading_score, writing_score, (math_score + reading_score + writing_score) AS total
2 | FROM exam.exams
3 | HAVING total > 200
4 | ORDER BY total DESC;
5 |
6 | SELECT race_ethnicity, AVG(math_score + reading_score + writing_score) AS AVERAGE
7 | FROM exam.exams
8 | GROUP BY race_ethnicity
9 | ORDER BY AVERAGE DESC;
10 |
11 | SELECT race_ethnicity, ROUND(AVG((math_score + reading_score + writing_score)/3),2) AS AVERAGE_SCORE
12 | FROM exam.exams
13 | GROUP BY race_ethnicity
14 | ORDER BY AVERAGE_SCORE DESC;
15 |
16 | SELECT race_ethnicity, ROUND(AVG(math_score),2) AS AVERAGE_MATH, ROUND(AVG(reading_score),2) AS AVERAGE_READING, ROUND(AVG(writing_score),2) AS AVERAGE_WRITING
17 | FROM exam.exams
18 | WHERE math_score > 70
19 | AND reading_score > 70
20 | AND writing_score > 70
21 | GROUP BY race_ethnicity;
22 |
23 |
--------------------------------------------------------------------------------
/MySQL/excercise 1.sql:
--------------------------------------------------------------------------------
1 | /* no 1 */
2 | SELECT VendorID, passenger_count, trip_distance, payment_type FROM new_schema.yellow_tlc_apr2022_1k
3 | WHERE trip_distance < 3
4 | AND payment_type = 3;
5 |
6 | /* no 2 */
7 | SELECT VendorID, passenger_count, trip_distance, payment_type FROM new_schema.yellow_tlc_apr2022_1k
8 | WHERE trip_distance < 3;
9 |
10 | /* no 3 */
11 | SELECT VendorID, passenger_count, trip_distance, payment_type FROM new_schema.yellow_tlc_apr2022_1k
12 | WHERE trip_distance < 3
13 | AND passenger_count = 1;
14 |
15 | /* no 4 */
16 | SELECT VendorID, passenger_count, trip_distance, payment_type FROM new_schema.yellow_tlc_apr2022_1k
17 | WHERE trip_distance
18 | BETWEEN 1.50 AND 1.60;
19 |
--------------------------------------------------------------------------------
/MySQL/sakila-dvd-rental.sql:
--------------------------------------------------------------------------------
1 | /* CATEGORIZE FILM WITH ACTOR NAME AND FILM CATEGORY */
2 | SELECT Z.actor_id,
3 | CONCAT(Z.first_name," ",Z.last_name) AS actor_name,
4 | X.film_id, C.title AS film_title,
5 | B.name AS category
6 | FROM sakila.actor Z
7 | INNER JOIN sakila.film_actor X
8 | ON Z.actor_id = X.actor_id
9 | INNER JOIN sakila.film_text C
10 | ON X.film_id = C.film_id
11 | INNER JOIN sakila.film_category V
12 | ON C.film_id = V.film_id
13 | INNER JOIN sakila.category B
14 | ON V.category_id = B.category_id
15 | WHERE B.name = 'Action';
16 |
17 | /* CUSTOMER PAYMENT DATA WITH PRICE AND FILM TITLE */
18 | SELECT CONCAT(Z.first_name," ",Z.last_name) AS customer_name,
19 | X.amount, X.payment_date,
20 | C.inventory_id, C.rental_id,
21 | V.film_id, B.title
22 | FROM sakila.customer Z
23 | INNER JOIN sakila.payment X
24 | ON Z.customer_id = X.customer_id
25 | INNER JOIN sakila.rental C
26 | ON X.customer_id = C.customer_id
27 | INNER JOIN sakila.inventory V
28 | ON C.inventory_id = V.inventory_id
29 | INNER JOIN sakila.film_text B
30 | ON V.film_id = B.film_id;
31 |
32 | /* CUSTOMER ADDRESS AND IDENTITY */
33 | SELECT CONCAT(Z.first_name, " ", Z.last_name) AS name,
34 | Z.email, Z.address_id,
35 | X.address
36 | FROM sakila.customer Z
37 | INNER JOIN sakila.address X
38 | ON Z.address_id = X.address_id
--------------------------------------------------------------------------------
/Natural Language Processing/emotion_streamlit.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import pandas as pd
3 | import seaborn as sns
4 | import matplotlib.pyplot as plt
5 | import re
6 | from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
7 | from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
8 | import collections
9 | import pathlib
10 | import textwrap
11 | import google.generativeai as genai
12 | from IPython.display import display
13 | from IPython.display import Markdown
14 |
15 | # Create stemmer
16 | factory = StemmerFactory()
17 | stemmer = factory.create_stemmer()
18 |
19 | # Create stopword remover
20 | stop_factory = StopWordRemoverFactory()
21 | more_stopword = ['dengan', 'ia', 'bahwa', 'oleh', 'rp', 'undang', 'pasal', 'ayat', 'bab']
22 | data = stop_factory.get_stop_words() + more_stopword
23 |
24 | # Define patterns for removal
25 | hyperlink_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
26 | number_pattern = re.compile(r'\b\d+\b')
27 | emoticon_pattern = re.compile(u'('
28 | u'\ud83c[\udf00-\udfff]|'
29 | u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
30 | u'[\u2600-\u26FF\u2700-\u27BF])+',
31 | re.UNICODE)
32 |
33 | st.title('Sentiment Analysis')
34 |
35 | uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
36 | custom_stopwords = st.text_input('Custom Stopwords (comma-separated)', '')
37 |
38 | if uploaded_file is not None and custom_stopwords:
39 | if st.button('Analyze'):
40 | df = pd.read_csv(uploaded_file)
41 | custom_stopword_list = [word.strip() for word in custom_stopwords.split(',')]
42 | all_stopwords = data + custom_stopword_list
43 |
44 | df['cleaned_text'] = df['full_text'].str.replace(hyperlink_pattern, '', regex=True)  # regex=True is required for compiled patterns in current pandas
45 | df['cleaned_text'] = df['cleaned_text'].str.replace(emoticon_pattern, '', regex=True)
46 | df['cleaned_text'] = df['cleaned_text'].str.replace(number_pattern, '', regex=True)
47 |
48 | for stopword in custom_stopword_list:
49 | df['cleaned_text'] = df['cleaned_text'].str.replace(stopword, '')
50 |
51 | df['cleaned_text'] = df['cleaned_text'].apply(lambda x: ' '.join(
52 | [stemmer.stem(word) for word in stop_factory.create_stop_word_remover().remove(x).split()
53 | if word.lower() not in all_stopwords]
54 | ))
55 |
56 | from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
57 |
58 | tokenizer = BertTokenizer.from_pretrained("indobert-emotion-classification")
59 | config = BertConfig.from_pretrained("indobert-emotion-classification")
60 | model = BertForSequenceClassification.from_pretrained("indobert-emotion-classification", config=config)
61 | from transformers import pipeline
62 |
63 | nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)  # reuse the model and tokenizer loaded above
64 | results = df['cleaned_text'].apply(lambda x: nlp(x)[0])
65 | df['label'] = [res['label'] for res in results]
66 | df['score'] = [res['score'] for res in results]
67 |
68 | sentiment_counts = df['label'].value_counts()
69 |
70 | st.write("### Sentiment Distribution")
71 | st.bar_chart(sentiment_counts)
72 |
73 | st.write("### Analysis Results")
74 | st.write(df)
75 |
76 | anger_text = ' '.join(df[df['label'] == 'Anger']['cleaned_text'])
77 | happy_text = ' '.join(df[df['label'] == 'Happy']['cleaned_text'])
78 | neutral_text = ' '.join(df[df['label'] == 'Neutral']['cleaned_text'])
79 | fear_text = ' '.join(df[df['label'] == 'Fear']['cleaned_text'])
80 | sadness_text = ' '.join(df[df['label'] == 'Sadness']['cleaned_text'])
81 | love_text = ' '.join(df[df['label'] == 'Love']['cleaned_text'])
82 |
83 | # Bigrams Anger Sentiment
84 | words1 = anger_text.split()
85 | # Get bigrams
86 | bigrams = list(zip(words1, words1[1:]))
87 |
88 | # Count bigrams
89 | bigram_counts = collections.Counter(bigrams)
90 |
91 | # Get top 10 bigram counts
92 | top_bigrams = dict(bigram_counts.most_common(10))
93 |
94 | # Create bar chart
95 | plt.figure(figsize=(10, 7))
96 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center')
97 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90)
98 | plt.xlabel('Bigram Words')
99 | plt.ylabel('Count')
100 | plt.title(f"Top 10 Bigram for Anger Sentiment")
101 | # Save the entire plot as a PNG
102 | plt.tight_layout()
103 | plt.savefig("bigram_anger.png")
104 | st.subheader("Bigram for Anger Sentiment")
105 | st.image("bigram_anger.png")
106 |
107 | def to_markdown(text):
108 | text = text.replace('•', ' *')
109 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
110 |
111 | genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc")
112 |
113 | import PIL.Image
114 |
115 | img = PIL.Image.open("bigram_anger.png")
116 | model = genai.GenerativeModel('gemini-pro-vision')
117 | response = model.generate_content(img)
118 |
119 | response = model.generate_content(["As a marketing consultant, I want to understand consumer insights based on the chart and the market context so I can use the key findings to formulate actionable insights", img])
120 | response.resolve()
121 | st.write("**Google Gemini Response About Data**")
122 | st.write(response.text)
123 |
124 |
125 |
126 |
127 | # Bigrams Happy Sentiment
128 | words1 = happy_text.split()
129 | # Get bigrams
130 | bigrams = list(zip(words1, words1[1:]))
131 |
132 | # Count bigrams
133 | bigram_counts = collections.Counter(bigrams)
134 |
135 | # Get top 10 bigram counts
136 | top_bigrams = dict(bigram_counts.most_common(10))
137 |
138 | # Create bar chart
139 | plt.figure(figsize=(10, 7))
140 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center')
141 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90)
142 | plt.xlabel('Bigram Words')
143 | plt.ylabel('Count')
144 | plt.title(f"Top 10 Bigram for Happy Sentiment")
145 | # Save the entire plot as a PNG
146 | plt.tight_layout()
147 | plt.savefig("bigram_happy.png")
148 | st.subheader("Bigram for Happy Sentiment")
149 | st.image("bigram_happy.png")
150 |
151 | def to_markdown(text):
152 | text = text.replace('•', ' *')
153 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
154 |
155 | genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc")
156 |
157 | import PIL.Image
158 |
159 | img = PIL.Image.open("bigram_happy.png")
160 | model = genai.GenerativeModel('gemini-pro-vision')
161 | response = model.generate_content(img)
162 |
163 | response = model.generate_content(["As a marketing consultant, I want to understand consumer insights based on the chart and the market context so I can use the key findings to formulate actionable insights", img])
164 | response.resolve()
165 | st.write("**Google Gemini Response About Data**")
166 | st.write(response.text)
167 |
168 |
169 |
170 |
171 | # Bigrams Neutral Sentiment
172 | words1 = neutral_text.split()
173 | # Get bigrams
174 | bigrams = list(zip(words1, words1[1:]))
175 |
176 | # Count bigrams
177 | bigram_counts = collections.Counter(bigrams)
178 |
179 | # Get top 10 bigram counts
180 | top_bigrams = dict(bigram_counts.most_common(10))
181 |
182 | # Create bar chart
183 | plt.figure(figsize=(10, 7))
184 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center')
185 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90)
186 | plt.xlabel('Bigram Words')
187 | plt.ylabel('Count')
188 | plt.title(f"Top 10 Bigram for Neutral Sentiment")
189 | # Save the entire plot as a PNG
190 | plt.tight_layout()
191 | plt.savefig("bigram_neutral.png")
192 | st.subheader("Bigram for Neutral Sentiment")
193 | st.image("bigram_neutral.png")
194 |
195 | def to_markdown(text):
196 | text = text.replace('•', ' *')
197 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
198 |
199 | genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc")
200 |
201 | import PIL.Image
202 |
203 | img = PIL.Image.open("bigram_neutral.png")
204 | model = genai.GenerativeModel('gemini-pro-vision')
205 | response = model.generate_content(img)
206 |
207 | response = model.generate_content(["As a marketing consultant, I want to understand consumer insights based on the chart and the market context so I can use the key findings to formulate actionable insights", img])
208 | response.resolve()
209 | st.write("**Google Gemini Response About Data**")
210 | st.write(response.text)
211 |
212 |
213 |
214 |
215 | # Bigrams Fear Sentiment
216 | words1 = fear_text.split()
217 | # Get bigrams
218 | bigrams = list(zip(words1, words1[1:]))
219 |
220 | # Count bigrams
221 | bigram_counts = collections.Counter(bigrams)
222 |
223 | # Get top 10 bigram counts
224 | top_bigrams = dict(bigram_counts.most_common(10))
225 |
226 | # Create bar chart
227 | plt.figure(figsize=(10, 7))
228 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center')
229 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90)
230 | plt.xlabel('Bigram Words')
231 | plt.ylabel('Count')
232 | plt.title(f"Top 10 Bigram for Fear Sentiment")
233 | # Save the entire plot as a PNG
234 | plt.tight_layout()
235 | plt.savefig("bigram_fear.png")
236 | st.subheader("Bigram for Fear Sentiment")
237 | st.image("bigram_fear.png")
238 |
239 | def to_markdown(text):
240 | text = text.replace('•', ' *')
241 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
242 |
243 | genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc")
244 |
245 | import PIL.Image
246 |
247 | img = PIL.Image.open("bigram_fear.png")
248 | model = genai.GenerativeModel('gemini-pro-vision')
249 | response = model.generate_content(img)
250 |
251 | response = model.generate_content(["As a marketing consultant, I want to understand consumer insights based on the chart and the market context so I can use the key findings to formulate actionable insights", img])
252 | response.resolve()
253 | st.write("**Google Gemini Response About Data**")
254 | st.write(response.text)
255 |
256 |
257 |
258 |
259 | # Bigrams Sadness Sentiment
260 | words1 = sadness_text.split()
261 | # Get bigrams
262 | bigrams = list(zip(words1, words1[1:]))
263 |
264 | # Count bigrams
265 | bigram_counts = collections.Counter(bigrams)
266 |
267 | # Get top 10 bigram counts
268 | top_bigrams = dict(bigram_counts.most_common(10))
269 |
270 | # Create bar chart
271 | plt.figure(figsize=(10, 7))
272 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center')
273 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90)
274 | plt.xlabel('Bigram Words')
275 | plt.ylabel('Count')
276 | plt.title(f"Top 10 Bigram for Sadness Sentiment")
277 | # Save the entire plot as a PNG
278 | plt.tight_layout()
279 | plt.savefig("bigram_sadness.png")
280 | st.subheader("Bigram for Sadness Sentiment")
281 | st.image("bigram_sadness.png")
282 |
283 | def to_markdown(text):
284 | text = text.replace('•', ' *')
285 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
286 |
287 | genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc")
288 |
289 | import PIL.Image
290 |
291 | img = PIL.Image.open("bigram_sadness.png")
292 | model = genai.GenerativeModel('gemini-pro-vision')
293 | response = model.generate_content(img)
294 |
295 | response = model.generate_content(["As a marketing consultant, I want to understand consumer insights based on the chart and the market context so I can use the key findings to formulate actionable insights", img])
296 | response.resolve()
297 | st.write("**Google Gemini Response About Data**")
298 | st.write(response.text)
299 |
300 |
301 |
302 |
303 | # Bigrams Love Sentiment
304 | words1 = love_text.split()
305 | # Get bigrams
306 | bigrams = list(zip(words1, words1[1:]))
307 |
308 | # Count bigrams
309 | bigram_counts = collections.Counter(bigrams)
310 |
311 | # Get top 10 bigram counts
312 | top_bigrams = dict(bigram_counts.most_common(10))
313 |
314 | # Create bar chart
315 | plt.figure(figsize=(10, 7))
316 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center')
317 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90)
318 | plt.xlabel('Bigram Words')
319 | plt.ylabel('Count')
320 | plt.title(f"Top 10 Bigram for Love Sentiment")
321 | # Save the entire plot as a PNG
322 | plt.tight_layout()
323 | plt.savefig("bigram_love.png")
324 | st.subheader("Bigram for Love Sentiment")
325 | st.image("bigram_love.png")
326 |
327 | def to_markdown(text):
328 | text = text.replace('•', ' *')
329 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
330 |
331 | genai.configure(api_key="AIzaSyC0HGxZs1MI5Nfc_9v9C9b5b7vTSMSlITc")
332 |
333 | import PIL.Image
334 |
335 | img = PIL.Image.open("bigram_love.png")
336 | model = genai.GenerativeModel('gemini-pro-vision')
337 | response = model.generate_content(img)
338 |
339 | response = model.generate_content(["As a marketing consultant, I want to understand consumer insights based on the chart and the market context so I can use the key findings to formulate actionable insights", img])
340 | response.resolve()
341 | st.write("**Google Gemini Response About Data**")
342 | st.write(response.text)
--------------------------------------------------------------------------------
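The six bigram blocks above differ only in the emotion label and output file name. A hedged refactoring sketch that folds the repetition into one helper; the `bigram_report` name is an assumption, and the commented loop at the end shows how it could replace the repeated blocks inside the Analyze branch:

```python
# Sketch: one helper to replace the six near-identical bigram + Gemini blocks above.
import collections

import matplotlib.pyplot as plt
import PIL.Image
import streamlit as st


def bigram_report(text: str, label: str, model) -> None:
    """Plot the top-10 bigrams for one emotion label and ask Gemini to comment on the chart."""
    words = text.split()
    top_bigrams = dict(collections.Counter(zip(words, words[1:])).most_common(10))

    plt.figure(figsize=(10, 7))
    plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center')
    plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90)
    plt.xlabel('Bigram Words')
    plt.ylabel('Count')
    plt.title(f"Top 10 Bigram for {label} Sentiment")
    plt.tight_layout()
    filename = f"bigram_{label.lower()}.png"
    plt.savefig(filename)

    st.subheader(f"Bigram for {label} Sentiment")
    st.image(filename)

    img = PIL.Image.open(filename)
    response = model.generate_content(
        ["As a marketing consultant, I want to understand consumer insights based on the chart "
         "and the market context so I can use the key findings to formulate actionable insights",
         img])
    response.resolve()
    st.write("**Google Gemini Response About Data**")
    st.write(response.text)

# Usage inside the Analyze branch, after the per-label texts are built:
# model = genai.GenerativeModel('gemini-pro-vision')
# for label, text in [("Anger", anger_text), ("Happy", happy_text), ("Neutral", neutral_text),
#                     ("Fear", fear_text), ("Sadness", sadness_text), ("Love", love_text)]:
#     bigram_report(text, label, model)
```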
/Power BI/pbi1.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Power BI/pbi1.JPG
--------------------------------------------------------------------------------
/Power BI/pbi2.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Power BI/pbi2.JPG
--------------------------------------------------------------------------------
/R Language/calculate.R:
--------------------------------------------------------------------------------
1 | library(ggplot2)
2 | library(dplyr)
3 | library(forcats)
4 | library(quantmod); library(PerformanceAnalytics)  # Return.calculate() comes from PerformanceAnalytics
5 | library(zoo)
6 | library(plotly)
7 |
8 | setwd("D:/RStudio/dataset")
9 | data3 <- read.csv("portfolio_data.csv")
10 |
11 | # Convert the "Date" column to the desired format "2013-05-01"
12 | data3$Date <- as.Date(data3$Date, format = "%m/%d/%Y")
13 |
14 | # Convert the data to an xts object using only the numeric columns
15 | prices_xts <- xts(data3[, -1], order.by = data3$Date)
16 |
17 | # Calculate the returns for each asset
18 | returns_xts <- Return.calculate(prices_xts)
19 |
29 | # Convert the returns back to a data frame
30 | returns_df <- data.frame(Date = index(returns_xts), coredata(returns_xts))
31 |
32 | # Create an interactive line chart for each asset's returns
33 | plot_ly(data = returns_df, x = ~Date) %>%
34 | add_lines(y = ~AMZN, name = "AMZN", line = list(color = "blue")) %>%
35 | add_lines(y = ~DPZ, name = "DPZ", line = list(color = "green")) %>%
36 | add_lines(y = ~BTC, name = "BTC", line = list(color = "orange")) %>%
37 | add_lines(y = ~NFLX, name = "NFLX", line = list(color = "red")) %>%
38 | layout(title = "Asset Returns Over Time",
39 | xaxis = list(title = "Date"),
40 | yaxis = list(title = "Returns"),
41 | showlegend = TRUE)
42 |
--------------------------------------------------------------------------------
/R Language/coba.R:
--------------------------------------------------------------------------------
1 | library(ggplot2)
2 | library(dplyr)
3 | library(forcats)
4 |
5 | setwd("D:/RStudio/dataset")
6 | data <- read.csv("ruu_sql2.csv")
7 |
8 | # Create the countplot
9 | ggplot(data, aes(x = fct_infreq(sponsor))) +
10 | geom_bar(stat = "count")
11 |
12 | # Customized countplot
13 | ggplot(data, aes(x = fct_infreq(sponsor), fill = sponsor)) +
14 | geom_bar() +
15 | labs(title = "Countplot of Sponsor",
16 | x = "Sponsor",
17 | y = "Count") +
18 | theme_minimal() +
19 | theme(axis.text.x = element_text(angle = 45, hjust = 1))
20 |
21 |
22 |
--------------------------------------------------------------------------------
/R Language/portfolio.R:
--------------------------------------------------------------------------------
1 | library(ggplot2)
2 | library(dplyr)
3 | library(forcats)
4 | library(quantmod)             # attaches xts/zoo for time-series objects
5 | library(PerformanceAnalytics) # provides Return.calculate() and Return.portfolio()
6 |
7 | setwd("D:/RStudio/dataset")
8 | data1 <- read.csv("portfolio_data.csv")
9 |
10 | # Convert the "Date" column to the desired format "2013-05-01"
11 | data1$Date <- as.Date(data1$Date, format = "%m/%d/%Y")
12 | # Build an xts price series, derive per-asset returns, then compute (equal-weight) portfolio returns
13 | returns1 <- Return.portfolio(na.omit(Return.calculate(xts(data1[, -1], order.by = data1$Date))))
14 |
15 |
16 | # Convert the xts object to a dataframe and extract the Date column
17 | returns_df <- data.frame(Date = index(returns1), portfolio.returns = coredata(returns1))
18 |
19 | # Create a line chart
20 | ggplot(data = returns_df, aes(x = Date, y = portfolio.returns)) +
21 | geom_line() +
22 | labs(title = "Portfolio Returns Over Time",
23 | x = "Date",
24 | y = "Portfolio Returns")
25 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # All-of-Data-Science-Project
2 | This repository contains all of my Data Science projects and portfolio work
3 |
--------------------------------------------------------------------------------
/Snowflake Cloud/README.md:
--------------------------------------------------------------------------------
1 | # Snowflake Project
2 | This repository contains all of my Snowflake Cloud projects. Use this code for reference only; modify it before using it in a real-world project.
3 |
4 | ## 1. Snowflake Python Connector
5 | This project connects a Snowflake database to a Jupyter Notebook with the Snowflake Python connector, so the data can then be queried and analyzed in Python.
6 |
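Below is a minimal, illustrative sketch of that pattern (not the full notebook); the credentials and the `BIKE_DETAILS` table name are placeholders to replace with your own account details and data:

```python
import pandas as pd
import snowflake.connector

# Placeholder credentials -- use your own account, warehouse, database and schema
conn = snowflake.connector.connect(
    user="YOUR_USER",
    password="YOUR_PASSWORD",
    account="YOUR_ACCOUNT",        # e.g. "xy12345.europe-west4.gcp"
    warehouse="COMPUTE_WH",
    database="DATABASE",
    schema="PUBLIC",
)

# Run a query and load the result into a Pandas DataFrame for analysis
cur = conn.cursor()
cur.execute("SELECT NAME, YEAR, SELLING_PRICE FROM BIKE_DETAILS")
df = pd.DataFrame(cur.fetchall(), columns=["Name", "Year", "Selling_Price"])
print(df.head())
```
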
7 | ## 2. Snowflake Snowpark Session
8 | This project processes data directly within the Snowflake cloud platform. Snowpark lets the user build data pipelines and applications for Snowflake in Python, Scala, or Java, simplifying data preprocessing tasks with familiar programming languages.
9 |
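Below is a minimal, illustrative Snowpark sketch under the same assumptions (placeholder credentials; the `CROSS_SELL` table from the notebook). It creates a session and pushes a filter down to Snowflake, so only the matching rows are returned instead of the whole table:

```python
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F

# Placeholder connection parameters -- replace with your own account details
session = Session.builder.configs({
    "user": "YOUR_USER",
    "password": "YOUR_PASSWORD",
    "account": "YOUR_ACCOUNT",
    "warehouse": "COMPUTE_WH",
    "database": "DATABASE",
    "schema": "PUBLIC",
}).create()

# The filter executes inside Snowflake; only rows with AGE between 30 and 44 come back
df = session.table("CROSS_SELL").filter(F.col("AGE").between(30, 44))
df.show()
```
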
--------------------------------------------------------------------------------
/Snowflake Cloud/Snowflake_Python_Connector.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "# Install Required Library"
21 | ],
22 | "metadata": {
23 | "id": "Q5-cg7fitgua"
24 | }
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "colab": {
31 | "base_uri": "https://localhost:8080/"
32 | },
33 | "id": "vvcfAobHpIau",
34 | "outputId": "2bf0572d-1fc5-46de-e1e2-6ff217deb515"
35 | },
36 | "outputs": [
37 | {
38 | "output_type": "stream",
39 | "name": "stdout",
40 | "text": [
41 | "Collecting snowflake-connector-python\n",
42 | " Downloading snowflake_connector_python-3.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.6 MB)\n",
43 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.6/2.6 MB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
44 | "\u001b[?25hCollecting asn1crypto<2.0.0,>0.24.0 (from snowflake-connector-python)\n",
45 | " Downloading asn1crypto-1.5.1-py2.py3-none-any.whl (105 kB)\n",
46 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.0/105.0 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
47 | "\u001b[?25hRequirement already satisfied: cffi<2.0.0,>=1.9 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (1.16.0)\n",
48 | "Requirement already satisfied: cryptography<43.0.0,>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (42.0.5)\n",
49 | "Requirement already satisfied: pyOpenSSL<25.0.0,>=16.2.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (24.1.0)\n",
50 | "Requirement already satisfied: pyjwt<3.0.0 in /usr/lib/python3/dist-packages (from snowflake-connector-python) (2.3.0)\n",
51 | "Requirement already satisfied: pytz in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (2023.4)\n",
52 | "Requirement already satisfied: requests<3.0.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (2.31.0)\n",
53 | "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (24.0)\n",
54 | "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (3.3.2)\n",
55 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (3.6)\n",
56 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (2024.2.2)\n",
57 | "Requirement already satisfied: typing-extensions<5,>=4.3 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (4.11.0)\n",
58 | "Requirement already satisfied: filelock<4,>=3.5 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (3.13.4)\n",
59 | "Requirement already satisfied: sortedcontainers>=2.4.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (2.4.0)\n",
60 | "Requirement already satisfied: platformdirs<5.0.0,>=2.6.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python) (4.2.0)\n",
61 | "Collecting tomlkit (from snowflake-connector-python)\n",
62 | " Downloading tomlkit-0.12.4-py3-none-any.whl (37 kB)\n",
63 | "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi<2.0.0,>=1.9->snowflake-connector-python) (2.22)\n",
64 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0->snowflake-connector-python) (2.0.7)\n",
65 | "Installing collected packages: asn1crypto, tomlkit, snowflake-connector-python\n",
66 | "Successfully installed asn1crypto-1.5.1 snowflake-connector-python-3.8.1 tomlkit-0.12.4\n",
67 | "Collecting snowflake-sqlalchemy\n",
68 | " Downloading snowflake_sqlalchemy-1.5.2-py3-none-any.whl (42 kB)\n",
69 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.4/42.4 kB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
70 | "\u001b[?25hRequirement already satisfied: snowflake-connector-python in /usr/local/lib/python3.10/dist-packages (from snowflake-sqlalchemy) (3.8.1)\n",
71 | "Requirement already satisfied: sqlalchemy in /usr/local/lib/python3.10/dist-packages (from snowflake-sqlalchemy) (2.0.29)\n",
72 | "Requirement already satisfied: asn1crypto<2.0.0,>0.24.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (1.5.1)\n",
73 | "Requirement already satisfied: cffi<2.0.0,>=1.9 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (1.16.0)\n",
74 | "Requirement already satisfied: cryptography<43.0.0,>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (42.0.5)\n",
75 | "Requirement already satisfied: pyOpenSSL<25.0.0,>=16.2.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (24.1.0)\n",
76 | "Requirement already satisfied: pyjwt<3.0.0 in /usr/lib/python3/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (2.3.0)\n",
77 | "Requirement already satisfied: pytz in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (2023.4)\n",
78 | "Requirement already satisfied: requests<3.0.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (2.31.0)\n",
79 | "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (24.0)\n",
80 | "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (3.3.2)\n",
81 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (3.6)\n",
82 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (2024.2.2)\n",
83 | "Requirement already satisfied: typing-extensions<5,>=4.3 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (4.11.0)\n",
84 | "Requirement already satisfied: filelock<4,>=3.5 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (3.13.4)\n",
85 | "Requirement already satisfied: sortedcontainers>=2.4.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (2.4.0)\n",
86 | "Requirement already satisfied: platformdirs<5.0.0,>=2.6.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (4.2.0)\n",
87 | "Requirement already satisfied: tomlkit in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python->snowflake-sqlalchemy) (0.12.4)\n",
88 | "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy->snowflake-sqlalchemy) (3.0.3)\n",
89 | "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi<2.0.0,>=1.9->snowflake-connector-python->snowflake-sqlalchemy) (2.22)\n",
90 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0->snowflake-connector-python->snowflake-sqlalchemy) (2.0.7)\n",
91 | "Installing collected packages: snowflake-sqlalchemy\n",
92 | "Successfully installed snowflake-sqlalchemy-1.5.2\n",
93 | "Requirement already satisfied: sqlalchemy in /usr/local/lib/python3.10/dist-packages (2.0.29)\n",
94 | "Requirement already satisfied: typing-extensions>=4.6.0 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy) (4.11.0)\n",
95 | "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy) (3.0.3)\n"
96 | ]
97 | }
98 | ],
99 | "source": [
100 | "!pip install snowflake-connector-python\n",
101 | "!pip install snowflake-sqlalchemy\n",
102 | "!pip install sqlalchemy"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "source": [
108 | "# Make a Connection into your Snowflake account"
109 | ],
110 | "metadata": {
111 | "id": "HsKtW8CatmJl"
112 | }
113 | },
114 | {
115 | "cell_type": "code",
116 | "source": [
117 | "import snowflake.connector\n",
118 | "from sqlalchemy import create_engine"
119 | ],
120 | "metadata": {
121 | "id": "NM4SW0NspLPo"
122 | },
123 | "execution_count": null,
124 | "outputs": []
125 | },
126 | {
127 | "cell_type": "code",
128 | "source": [
129 | "# Snowflake connection parameters\n",
130 | "username = 'MAGICDASH91'\n",
131 | "password = '*************'\n",
132 | "account = 'tk11073.europe-west4.gcp'\n",
133 | "warehouse = 'COMPUTE_WH'\n",
134 | "database = 'DATABASE'\n",
135 | "schema = 'PUBLIC'\n",
136 | "\n",
137 | "# Establishing connection\n",
138 | "conn = snowflake.connector.connect(\n",
139 | " user=username,\n",
140 | " password=password,\n",
141 | " account=account,\n",
142 | " warehouse=warehouse,\n",
143 | " database=database,\n",
144 | " schema=schema\n",
145 | ")\n",
146 | "\n",
147 | "# Creating a cursor object\n",
148 | "cur = conn.cursor()"
149 | ],
150 | "metadata": {
151 | "id": "xFqBAYWWpNON"
152 | },
153 | "execution_count": null,
154 | "outputs": []
155 | },
156 | {
157 | "cell_type": "code",
158 | "source": [
159 | "import pandas as pd\n",
160 | "\n",
161 | "# Executing a modified query\n",
162 | "cur.execute(\"\"\"\n",
163 | " SELECT\n",
164 | " NAME,\n",
165 | " YEAR,\n",
166 | " SELLING_PRICE,\n",
167 | " CASE\n",
168 | " WHEN SELLING_PRICE < 100000 THEN 'CHEAP'\n",
169 | " WHEN SELLING_PRICE >= 100000 AND SELLING_PRICE <= 200000 THEN 'NORMAL'\n",
170 | " ELSE 'EXPENSIVE'\n",
171 | " END AS SELLING_PRICE_LABEL\n",
172 | " FROM\n",
173 | " BIKE_DETAILS\n",
174 | "\"\"\")\n",
175 | "\n",
176 | "# Fetching results\n",
177 | "rows = cur.fetchall()\n",
178 | "\n",
179 | "# Creating a Pandas DataFrame\n",
180 | "df = pd.DataFrame(rows, columns=['Name', 'Year', 'Selling_Price', 'Selling_Price_Label'])\n",
181 | "\n",
182 | "# Displaying the DataFrame\n",
183 | "df"
184 | ],
185 | "metadata": {
186 | "colab": {
187 | "base_uri": "https://localhost:8080/",
188 | "height": 424
189 | },
190 | "id": "kAL7Zo4-tA1w",
191 | "outputId": "2f46ddc6-e33f-441d-9822-5960ef26bf27"
192 | },
193 | "execution_count": null,
194 | "outputs": [
195 | {
196 | "output_type": "execute_result",
197 | "data": {
198 | "text/plain": [
199 | " Name Year Selling_Price \\\n",
200 | "0 Royal Enfield Classic 350 2019 175000 \n",
201 | "1 Honda Dio 2017 45000 \n",
202 | "2 Royal Enfield Classic Gunmetal Grey 2018 150000 \n",
203 | "3 Yamaha Fazer FI V 2.0 [2016-2018] 2015 65000 \n",
204 | "4 Yamaha SZ [2013-2014] 2011 20000 \n",
205 | "... ... ... ... \n",
206 | "1056 Activa 3g 2010 17000 \n",
207 | "1057 Honda CB twister 2012 16000 \n",
208 | "1058 Bajaj Discover 125 2013 15000 \n",
209 | "1059 Honda CB Shine 2009 12000 \n",
210 | "1060 Bajaj Pulsar 150 2008 10000 \n",
211 | "\n",
212 | " Selling_Price_Label \n",
213 | "0 NORMAL \n",
214 | "1 CHEAP \n",
215 | "2 NORMAL \n",
216 | "3 CHEAP \n",
217 | "4 CHEAP \n",
218 | "... ... \n",
219 | "1056 CHEAP \n",
220 | "1057 CHEAP \n",
221 | "1058 CHEAP \n",
222 | "1059 CHEAP \n",
223 | "1060 CHEAP \n",
224 | "\n",
225 | "[1061 rows x 4 columns]"
226 | ],
227 | "text/html": [
228 | "\n",
229 | "
\n",
230 | "
\n",
231 | "\n",
244 | "
\n",
245 | " \n",
246 | " \n",
247 | " | \n",
248 | " Name | \n",
249 | " Year | \n",
250 | " Selling_Price | \n",
251 | " Selling_Price_Label | \n",
252 | "
\n",
253 | " \n",
254 | " \n",
255 | " \n",
256 | " 0 | \n",
257 | " Royal Enfield Classic 350 | \n",
258 | " 2019 | \n",
259 | " 175000 | \n",
260 | " NORMAL | \n",
261 | "
\n",
262 | " \n",
263 | " 1 | \n",
264 | " Honda Dio | \n",
265 | " 2017 | \n",
266 | " 45000 | \n",
267 | " CHEAP | \n",
268 | "
\n",
269 | " \n",
270 | " 2 | \n",
271 | " Royal Enfield Classic Gunmetal Grey | \n",
272 | " 2018 | \n",
273 | " 150000 | \n",
274 | " NORMAL | \n",
275 | "
\n",
276 | " \n",
277 | " 3 | \n",
278 | " Yamaha Fazer FI V 2.0 [2016-2018] | \n",
279 | " 2015 | \n",
280 | " 65000 | \n",
281 | " CHEAP | \n",
282 | "
\n",
283 | " \n",
284 | " 4 | \n",
285 | " Yamaha SZ [2013-2014] | \n",
286 | " 2011 | \n",
287 | " 20000 | \n",
288 | " CHEAP | \n",
289 | "
\n",
290 | " \n",
291 | " ... | \n",
292 | " ... | \n",
293 | " ... | \n",
294 | " ... | \n",
295 | " ... | \n",
296 | "
\n",
297 | " \n",
298 | " 1056 | \n",
299 | " Activa 3g | \n",
300 | " 2010 | \n",
301 | " 17000 | \n",
302 | " CHEAP | \n",
303 | "
\n",
304 | " \n",
305 | " 1057 | \n",
306 | " Honda CB twister | \n",
307 | " 2012 | \n",
308 | " 16000 | \n",
309 | " CHEAP | \n",
310 | "
\n",
311 | " \n",
312 | " 1058 | \n",
313 | " Bajaj Discover 125 | \n",
314 | " 2013 | \n",
315 | " 15000 | \n",
316 | " CHEAP | \n",
317 | "
\n",
318 | " \n",
319 | " 1059 | \n",
320 | " Honda CB Shine | \n",
321 | " 2009 | \n",
322 | " 12000 | \n",
323 | " CHEAP | \n",
324 | "
\n",
325 | " \n",
326 | " 1060 | \n",
327 | " Bajaj Pulsar 150 | \n",
328 | " 2008 | \n",
329 | " 10000 | \n",
330 | " CHEAP | \n",
331 | "
\n",
332 | " \n",
333 | "
\n",
334 | "
1061 rows × 4 columns
\n",
335 | "
\n",
336 | "
\n",
543 | "
\n"
544 | ],
545 | "application/vnd.google.colaboratory.intrinsic+json": {
546 | "type": "dataframe",
547 | "variable_name": "df",
548 | "summary": "{\n \"name\": \"df\",\n \"rows\": 1061,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 276,\n \"samples\": [\n \"Hero Xtreme Sports\",\n \"Bajaj Avenger [2015]\",\n \"Bajaj Avenger Street 160\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Year\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4,\n \"min\": 1988,\n \"max\": 2020,\n \"num_unique_values\": 28,\n \"samples\": [\n 2012,\n 2003,\n 2020\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Selling_Price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 56304,\n \"min\": 5000,\n \"max\": 760000,\n \"num_unique_values\": 130,\n \"samples\": [\n 72000,\n 160000,\n 26000\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Selling_Price_Label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"NORMAL\",\n \"CHEAP\",\n \"EXPENSIVE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
549 | }
550 | },
551 | "metadata": {},
552 | "execution_count": 4
553 | }
554 | ]
555 | }
556 | ]
557 | }
558 |
--------------------------------------------------------------------------------
/Snowflake Cloud/Snowflake_Snowpark_Session.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "# Install Required Library"
21 | ],
22 | "metadata": {
23 | "id": "xe3ioIeBOEVv"
24 | }
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "colab": {
31 | "base_uri": "https://localhost:8080/"
32 | },
33 | "id": "nUI_Fe6nLjC-",
34 | "outputId": "a9217013-5cd8-48f6-b4dd-2217f94882f3"
35 | },
36 | "outputs": [
37 | {
38 | "output_type": "stream",
39 | "name": "stdout",
40 | "text": [
41 | "Requirement already satisfied: snowflake-snowpark-python in /usr/local/lib/python3.10/dist-packages (1.14.0)\n",
42 | "Requirement already satisfied: setuptools>=40.6.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-snowpark-python) (67.7.2)\n",
43 | "Requirement already satisfied: wheel in /usr/local/lib/python3.10/dist-packages (from snowflake-snowpark-python) (0.43.0)\n",
44 | "Requirement already satisfied: snowflake-connector-python<4.0.0,>=3.6.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-snowpark-python) (3.8.1)\n",
45 | "Requirement already satisfied: typing-extensions<5.0.0,>=4.1.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-snowpark-python) (4.11.0)\n",
46 | "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from snowflake-snowpark-python) (6.0.1)\n",
47 | "Requirement already satisfied: cloudpickle!=2.1.0,!=2.2.0,<=2.2.1,>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-snowpark-python) (2.2.1)\n",
48 | "Requirement already satisfied: asn1crypto<2.0.0,>0.24.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (1.5.1)\n",
49 | "Requirement already satisfied: cffi<2.0.0,>=1.9 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (1.16.0)\n",
50 | "Requirement already satisfied: cryptography<43.0.0,>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (42.0.5)\n",
51 | "Requirement already satisfied: pyOpenSSL<25.0.0,>=16.2.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (24.1.0)\n",
52 | "Requirement already satisfied: pyjwt<3.0.0 in /usr/lib/python3/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2.3.0)\n",
53 | "Requirement already satisfied: pytz in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2023.4)\n",
54 | "Requirement already satisfied: requests<3.0.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2.31.0)\n",
55 | "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (24.0)\n",
56 | "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (3.3.2)\n",
57 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (3.6)\n",
58 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2024.2.2)\n",
59 | "Requirement already satisfied: filelock<4,>=3.5 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (3.13.4)\n",
60 | "Requirement already satisfied: sortedcontainers>=2.4.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2.4.0)\n",
61 | "Requirement already satisfied: platformdirs<5.0.0,>=2.6.0 in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (4.2.0)\n",
62 | "Requirement already satisfied: tomlkit in /usr/local/lib/python3.10/dist-packages (from snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (0.12.4)\n",
63 | "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi<2.0.0,>=1.9->snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2.22)\n",
64 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0->snowflake-connector-python<4.0.0,>=3.6.0->snowflake-snowpark-python) (2.0.7)\n"
65 | ]
66 | }
67 | ],
68 | "source": [
69 | "pip install snowflake-snowpark-python"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "source": [
75 | "# Create Snowpark Session"
76 | ],
77 | "metadata": {
78 | "id": "P7tMNrSvToO9"
79 | }
80 | },
81 | {
82 | "cell_type": "code",
83 | "source": [
84 | "from snowflake.snowpark.session import Session\n",
85 | "\n",
86 | "username = 'MAGICDASH91'\n",
87 | "password = '*************'\n",
88 | "account = 'tk11073.europe-west4.gcp'\n",
89 | "warehouse = 'COMPUTE_WH'\n",
90 | "database = 'DATABASE'\n",
91 | "schema = 'PUBLIC'\n",
92 | "\n",
93 | "def snowpark_session_create():\n",
94 | " connection_params = {\n",
95 | " \"user\": username,\n",
96 | " \"password\": password,\n",
97 | " \"account\": account,\n",
98 | " \"warehouse\": warehouse,\n",
99 | " \"database\": database,\n",
100 | " \"schema\": schema\n",
101 | " }\n",
102 | "\n",
103 | " # Create the session\n",
104 | " session = Session.builder.configs(connection_params).create()\n",
105 | " return session\n",
106 | "\n",
107 | "demo_session = snowpark_session_create()"
108 | ],
109 | "metadata": {
110 | "id": "_UiOBJ79Mhmb"
111 | },
112 | "execution_count": null,
113 | "outputs": []
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "source": [
118 | "# Start Querying your data"
119 | ],
120 | "metadata": {
121 | "id": "F961ofKuT0h9"
122 | }
123 | },
124 | {
125 | "cell_type": "code",
126 | "source": [
127 | "df = demo_session.sql('SELECT * FROM CROSS_SELL')\n",
128 | "df.show()"
129 | ],
130 | "metadata": {
131 | "colab": {
132 | "base_uri": "https://localhost:8080/"
133 | },
134 | "id": "FlYG5jeXTrSr",
135 | "outputId": "e11a1247-a83e-4ebe-b0f6-7f53bb6d0ce2"
136 | },
137 | "execution_count": null,
138 | "outputs": [
139 | {
140 | "output_type": "stream",
141 | "name": "stdout",
142 | "text": [
143 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
144 | "|\"ID\" |\"GENDER\" |\"AGE\" |\"DRIVING_LICENSE\" |\"REGION_CODE\" |\"PREVIOUSLY_INSURED\" |\"VEHICLE_AGE\" |\"VEHICLE_DAMAGE\" |\"ANNUAL_PREMIUM\" |\"POLICY_SALES_CHANNEL\" |\"VINTAGE\" |\"RESPONSE\" |\n",
145 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
146 | "|1 |Male |44 |1 |28.0 |0 |> 2 Years |True |40454.0 |26.0 |217 |1 |\n",
147 | "|2 |Male |76 |1 |3.0 |0 |1-2 Year |False |33536.0 |26.0 |183 |0 |\n",
148 | "|3 |Male |47 |1 |28.0 |0 |> 2 Years |True |38294.0 |26.0 |27 |1 |\n",
149 | "|4 |Male |21 |1 |11.0 |1 |< 1 Year |False |28619.0 |152.0 |203 |0 |\n",
150 | "|5 |Female |29 |1 |41.0 |1 |< 1 Year |False |27496.0 |152.0 |39 |0 |\n",
151 | "|6 |Female |24 |1 |33.0 |0 |< 1 Year |True |2630.0 |160.0 |176 |0 |\n",
152 | "|7 |Male |23 |1 |11.0 |0 |< 1 Year |True |23367.0 |152.0 |249 |0 |\n",
153 | "|8 |Female |56 |1 |28.0 |0 |1-2 Year |True |32031.0 |26.0 |72 |1 |\n",
154 | "|9 |Female |24 |1 |3.0 |1 |< 1 Year |False |27619.0 |152.0 |28 |0 |\n",
155 | "|10 |Female |32 |1 |6.0 |1 |< 1 Year |False |28771.0 |152.0 |80 |0 |\n",
156 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
157 | "\n"
158 | ]
159 | }
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "source": [
165 | "# Snowflake Transformation"
166 | ],
167 | "metadata": {
168 | "id": "3n2-ctOPg5HG"
169 | }
170 | },
171 | {
172 | "cell_type": "code",
173 | "source": [
174 | "import snowflake.snowpark.functions as F"
175 | ],
176 | "metadata": {
177 | "id": "GQI9PNEDVHBl"
178 | },
179 | "execution_count": null,
180 | "outputs": []
181 | },
182 | {
183 | "cell_type": "code",
184 | "source": [
185 | "# Show the Age where Age between 30 and 44\n",
186 | "df_age = df.filter(F.col('AGE').between(30,44))\n",
187 | "df_age.show()"
188 | ],
189 | "metadata": {
190 | "colab": {
191 | "base_uri": "https://localhost:8080/"
192 | },
193 | "id": "Q6aL1zaLhpyw",
194 | "outputId": "0f315e6c-d52c-4658-bd42-a4389bbf1631"
195 | },
196 | "execution_count": null,
197 | "outputs": [
198 | {
199 | "output_type": "stream",
200 | "name": "stdout",
201 | "text": [
202 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
203 | "|\"ID\" |\"GENDER\" |\"AGE\" |\"DRIVING_LICENSE\" |\"REGION_CODE\" |\"PREVIOUSLY_INSURED\" |\"VEHICLE_AGE\" |\"VEHICLE_DAMAGE\" |\"ANNUAL_PREMIUM\" |\"POLICY_SALES_CHANNEL\" |\"VINTAGE\" |\"RESPONSE\" |\n",
204 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
205 | "|1 |Male |44 |1 |28.0 |0 |> 2 Years |True |40454.0 |26.0 |217 |1 |\n",
206 | "|10 |Female |32 |1 |6.0 |1 |< 1 Year |False |28771.0 |152.0 |80 |0 |\n",
207 | "|13 |Female |41 |1 |15.0 |1 |1-2 Year |False |31409.0 |14.0 |221 |0 |\n",
208 | "|16 |Male |37 |1 |6.0 |0 |1-2 Year |True |2630.0 |156.0 |147 |1 |\n",
209 | "|19 |Male |42 |1 |28.0 |0 |1-2 Year |True |33667.0 |124.0 |158 |0 |\n",
210 | "|24 |Male |44 |1 |28.0 |0 |1-2 Year |True |41852.0 |163.0 |60 |0 |\n",
211 | "|25 |Male |34 |1 |15.0 |1 |1-2 Year |False |38111.0 |152.0 |180 |0 |\n",
212 | "|35 |Female |32 |1 |30.0 |1 |< 1 Year |False |27638.0 |152.0 |169 |0 |\n",
213 | "|36 |Male |41 |1 |36.0 |1 |1-2 Year |False |30039.0 |124.0 |88 |0 |\n",
214 | "|41 |Male |30 |1 |30.0 |0 |< 1 Year |True |24550.0 |124.0 |45 |0 |\n",
215 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
216 | "\n"
217 | ]
218 | }
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "source": [
224 | "# Create an aggregation about Average ANNUAL_PREMIUM for every VEHICLE_AGE\n",
225 | "avg_ann = df.group_by('VEHICLE_AGE').agg(F.avg('ANNUAL_PREMIUM').alias('AVERAGE_ANNUAL_PREMIUM'))\n",
226 | "avg_ann.show()"
227 | ],
228 | "metadata": {
229 | "colab": {
230 | "base_uri": "https://localhost:8080/"
231 | },
232 | "id": "KrAhFfgVkLu1",
233 | "outputId": "420c4b76-1415-49f9-ccd6-06bb968ec3b6"
234 | },
235 | "execution_count": null,
236 | "outputs": [
237 | {
238 | "output_type": "stream",
239 | "name": "stdout",
240 | "text": [
241 | "--------------------------------------------\n",
242 | "|\"VEHICLE_AGE\" |\"AVERAGE_ANNUAL_PREMIUM\" |\n",
243 | "--------------------------------------------\n",
244 | "|> 2 Years |35654.4994690 |\n",
245 | "|1-2 Year |30523.5821203 |\n",
246 | "|< 1 Year |30119.5520251 |\n",
247 | "--------------------------------------------\n",
248 | "\n"
249 | ]
250 | }
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "source": [
256 | "# Simple Multiplication\n",
257 | "mul_col = df.with_column(\"AGE & VINTAGE\", F.col('AGE') * F.col('VINTAGE'))\n",
258 | "mul_col.show()"
259 | ],
260 | "metadata": {
261 | "colab": {
262 | "base_uri": "https://localhost:8080/"
263 | },
264 | "id": "dWGuzthgoXmu",
265 | "outputId": "8e5a7484-934c-4360-9a2d-5c74ec285fdf"
266 | },
267 | "execution_count": null,
268 | "outputs": [
269 | {
270 | "output_type": "stream",
271 | "name": "stdout",
272 | "text": [
273 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
274 | "|\"ID\" |\"GENDER\" |\"AGE\" |\"DRIVING_LICENSE\" |\"REGION_CODE\" |\"PREVIOUSLY_INSURED\" |\"VEHICLE_AGE\" |\"VEHICLE_DAMAGE\" |\"ANNUAL_PREMIUM\" |\"POLICY_SALES_CHANNEL\" |\"VINTAGE\" |\"RESPONSE\" |\"AGE & VINTAGE\" |\n",
275 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
276 | "|1 |Male |44 |1 |28.0 |0 |> 2 Years |True |40454.0 |26.0 |217 |1 |9548 |\n",
277 | "|2 |Male |76 |1 |3.0 |0 |1-2 Year |False |33536.0 |26.0 |183 |0 |13908 |\n",
278 | "|3 |Male |47 |1 |28.0 |0 |> 2 Years |True |38294.0 |26.0 |27 |1 |1269 |\n",
279 | "|4 |Male |21 |1 |11.0 |1 |< 1 Year |False |28619.0 |152.0 |203 |0 |4263 |\n",
280 | "|5 |Female |29 |1 |41.0 |1 |< 1 Year |False |27496.0 |152.0 |39 |0 |1131 |\n",
281 | "|6 |Female |24 |1 |33.0 |0 |< 1 Year |True |2630.0 |160.0 |176 |0 |4224 |\n",
282 | "|7 |Male |23 |1 |11.0 |0 |< 1 Year |True |23367.0 |152.0 |249 |0 |5727 |\n",
283 | "|8 |Female |56 |1 |28.0 |0 |1-2 Year |True |32031.0 |26.0 |72 |1 |4032 |\n",
284 | "|9 |Female |24 |1 |3.0 |1 |< 1 Year |False |27619.0 |152.0 |28 |0 |672 |\n",
285 | "|10 |Female |32 |1 |6.0 |1 |< 1 Year |False |28771.0 |152.0 |80 |0 |2560 |\n",
286 | "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
287 | "\n"
288 | ]
289 | }
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "source": [
295 | "# For other Snowpark Functions you can check here :\n",
296 | "\n",
297 | "https://docs.snowflake.com/en/developer-guide/snowpark/reference/python/latest/api/snowflake.snowpark.functions.function"
298 | ],
299 | "metadata": {
300 | "id": "5RvyahLxno7x"
301 | }
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "source": [
306 | "# Alter (Editing) Existing Dataframe"
307 | ],
308 | "metadata": {
309 | "id": "MqHqX9p_rzs6"
310 | }
311 | },
312 | {
313 | "cell_type": "code",
314 | "source": [
315 | "# Rename \"AVERAGE_ANNUAL_PREMIUM\" to be \"AVERAGE_ANNUAL_PREMIUM_ALL_AGE\"\n",
316 | "avg_ann = avg_ann.with_column_renamed(F.col('AVERAGE_ANNUAL_PREMIUM'), 'AVERAGE_ANNUAL_PREMIUM_ALL_AGE')\n",
317 | "avg_ann.show()"
318 | ],
319 | "metadata": {
320 | "colab": {
321 | "base_uri": "https://localhost:8080/"
322 | },
323 | "id": "_i19UKLOsFnp",
324 | "outputId": "896d0273-da93-4764-93c2-dd858cd3d66d"
325 | },
326 | "execution_count": null,
327 | "outputs": [
328 | {
329 | "output_type": "stream",
330 | "name": "stdout",
331 | "text": [
332 | "----------------------------------------------------\n",
333 | "|\"VEHICLE_AGE\" |\"AVERAGE_ANNUAL_PREMIUM_ALL_AGE\" |\n",
334 | "----------------------------------------------------\n",
335 | "|> 2 Years |35654.4994690 |\n",
336 | "|1-2 Year |30523.5821203 |\n",
337 | "|< 1 Year |30119.5520251 |\n",
338 | "----------------------------------------------------\n",
339 | "\n"
340 | ]
341 | }
342 | ]
343 | },
344 | {
345 | "cell_type": "markdown",
346 | "source": [
347 | "# Snowflake Drop column"
348 | ],
349 | "metadata": {
350 | "id": "pCfM2Bxytgd7"
351 | }
352 | },
353 | {
354 | "cell_type": "code",
355 | "source": [
356 | "df.drop(\"ID\").show()"
357 | ],
358 | "metadata": {
359 | "colab": {
360 | "base_uri": "https://localhost:8080/"
361 | },
362 | "id": "Wv78trnDtic7",
363 | "outputId": "0879ffc1-ad0e-4622-a74a-a422da61a915"
364 | },
365 | "execution_count": null,
366 | "outputs": [
367 | {
368 | "output_type": "stream",
369 | "name": "stdout",
370 | "text": [
371 | "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
372 | "|\"GENDER\" |\"AGE\" |\"DRIVING_LICENSE\" |\"REGION_CODE\" |\"PREVIOUSLY_INSURED\" |\"VEHICLE_AGE\" |\"VEHICLE_DAMAGE\" |\"ANNUAL_PREMIUM\" |\"POLICY_SALES_CHANNEL\" |\"VINTAGE\" |\"RESPONSE\" |\n",
373 | "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
374 | "|Male |44 |1 |28.0 |0 |> 2 Years |True |40454.0 |26.0 |217 |1 |\n",
375 | "|Male |76 |1 |3.0 |0 |1-2 Year |False |33536.0 |26.0 |183 |0 |\n",
376 | "|Male |47 |1 |28.0 |0 |> 2 Years |True |38294.0 |26.0 |27 |1 |\n",
377 | "|Male |21 |1 |11.0 |1 |< 1 Year |False |28619.0 |152.0 |203 |0 |\n",
378 | "|Female |29 |1 |41.0 |1 |< 1 Year |False |27496.0 |152.0 |39 |0 |\n",
379 | "|Female |24 |1 |33.0 |0 |< 1 Year |True |2630.0 |160.0 |176 |0 |\n",
380 | "|Male |23 |1 |11.0 |0 |< 1 Year |True |23367.0 |152.0 |249 |0 |\n",
381 | "|Female |56 |1 |28.0 |0 |1-2 Year |True |32031.0 |26.0 |72 |1 |\n",
382 | "|Female |24 |1 |3.0 |1 |< 1 Year |False |27619.0 |152.0 |28 |0 |\n",
383 | "|Female |32 |1 |6.0 |1 |< 1 Year |False |28771.0 |152.0 |80 |0 |\n",
384 | "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
385 | "\n"
386 | ]
387 | }
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "source": [
393 | "# Join the table"
394 | ],
395 | "metadata": {
396 | "id": "HZ39g44VtVOC"
397 | }
398 | },
399 | {
400 | "cell_type": "code",
401 | "source": [
402 | "# We have to make 2nd aggregation dataframe\n",
403 | "avg_ann2 = df_age.group_by('VEHICLE_AGE').agg(F.avg('ANNUAL_PREMIUM').alias('AVERAGE_ANNUAL_PREMIUM_30_TO_44'))\n",
404 | "avg_ann2.show()"
405 | ],
406 | "metadata": {
407 | "colab": {
408 | "base_uri": "https://localhost:8080/"
409 | },
410 | "id": "FHO9Xi03tXCD",
411 | "outputId": "cf36e6a0-91fc-4458-b7fb-a421c5ac3725"
412 | },
413 | "execution_count": null,
414 | "outputs": [
415 | {
416 | "output_type": "stream",
417 | "name": "stdout",
418 | "text": [
419 | "-----------------------------------------------------\n",
420 | "|\"VEHICLE_AGE\" |\"AVERAGE_ANNUAL_PREMIUM_30_TO_44\" |\n",
421 | "-----------------------------------------------------\n",
422 | "|> 2 Years |33157.8273078 |\n",
423 | "|< 1 Year |27853.8153776 |\n",
424 | "|1-2 Year |28789.0972791 |\n",
425 | "-----------------------------------------------------\n",
426 | "\n"
427 | ]
428 | }
429 | ]
430 | },
431 | {
432 | "cell_type": "code",
433 | "source": [
434 | "join_df = avg_ann.join(avg_ann2, \"VEHICLE_AGE\").select(avg_ann.VEHICLE_AGE.alias(\"VEHICLE_AGE\"),\n",
435 | " avg_ann.AVERAGE_ANNUAL_PREMIUM_ALL_AGE,\n",
436 | " avg_ann2.AVERAGE_ANNUAL_PREMIUM_30_TO_44)\n",
437 | "\n",
438 | "join_df.show()"
439 | ],
440 | "metadata": {
441 | "colab": {
442 | "base_uri": "https://localhost:8080/"
443 | },
444 | "id": "zc1OkGCWt5Ie",
445 | "outputId": "1888e742-f3c3-4b14-9196-b65e76379704"
446 | },
447 | "execution_count": null,
448 | "outputs": [
449 | {
450 | "output_type": "stream",
451 | "name": "stdout",
452 | "text": [
453 | "----------------------------------------------------------------------------------------\n",
454 | "|\"VEHICLE_AGE\" |\"AVERAGE_ANNUAL_PREMIUM_ALL_AGE\" |\"AVERAGE_ANNUAL_PREMIUM_30_TO_44\" |\n",
455 | "----------------------------------------------------------------------------------------\n",
456 | "|> 2 Years |35654.4994690 |33157.8273078 |\n",
457 | "|1-2 Year |30523.5821203 |28789.0972791 |\n",
458 | "|< 1 Year |30119.5520251 |27853.8153776 |\n",
459 | "----------------------------------------------------------------------------------------\n",
460 | "\n"
461 | ]
462 | }
463 | ]
464 | }
465 | ]
466 | }
467 |
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/README.md:
--------------------------------------------------------------------------------
1 | # Streamlit Web Application Project with Google Gemini
2 | This repository contains all of my Streamlit Web Applications. Use this code for reference only; modify it before using it in a real-world project.
3 |
4 | ## 1. Auto Sentiment Analysis Twitter (Google Gemini)
5 | This project lets the user run automatic sentiment analysis on their Twitter dataset and visualize the results with Wordcloud and Bi-Gram visualizations. Google Gemini then writes a conclusion and actionable insights based on each visualization.
6 |
7 | ## 2. Chat With Your CSV (Google Gemini)
8 | This project lets the user analyze their own CSV dataset. The user selects the target variable and the columns (attributes) to analyze for data visualization. The project produces 4 visualizations: Countplot, Histoplot, Multiclass Countplot, and Multiclass Histoplot. The user can then chat with Google Gemini about all of the visualized data.
9 |
10 | ## 3. CheatGPT (Google Gemini)
11 | This project lets the user upload an image, and Google Gemini answers the question shown in the uploaded image. You only need to screenshot the exam question (a minimal sketch of the call is shown below).
12 |
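A minimal sketch of the underlying call, reusing the same `gemini-pro-vision` pattern found in the scripts in this repository; the API key and the image filename are placeholders:

```python
import google.generativeai as genai
import PIL.Image

genai.configure(api_key="YOUR_API_KEY")        # placeholder key
img = PIL.Image.open("exam_question.png")      # placeholder screenshot of the question

# Ask the vision model to answer the question visible in the image
model = genai.GenerativeModel("gemini-pro-vision")
response = model.generate_content(["Answer the exam question shown in this image", img])
response.resolve()
print(response.text)
```
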
13 | ## 4. Complete Pack
14 | This project is the complete pack of Data Science tools in one app: Machine Learning Classification Prediction, Machine Learning Regression Prediction, PDF Document Analysis, Sentiment Analysis, CSV File Analysis, Clustering, and EDA with Google Gemini.
15 |
16 | ## 5. E-Commerce Clustering (Google Gemini, K-Means)
17 | This project lets the user run a clustering method on their CSV file. The user first uploads a CSV file, picks 3 numerical columns for clustering, and defines how many clusters they want. The system then shows a 3D clustering visualization, and Google Gemini gives a response based on the 3D clustering result (a minimal sketch of the clustering step is shown below).
18 |
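A minimal sketch of that flow, assuming a hypothetical CSV with three numeric columns; the file name, column names, and cluster count are placeholders:

```python
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans

df = pd.read_csv("ecommerce.csv")              # placeholder file
cols = ["RECENCY", "FREQUENCY", "MONETARY"]    # 3 user-selected numeric columns
n_clusters = 4                                 # user-defined number of clusters

# Fit K-Means on the selected columns and label every row with its cluster
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(df[cols])

# 3D scatter plot of the clusters, one color per cluster
fig = px.scatter_3d(df, x=cols[0], y=cols[1], z=cols[2], color="cluster")
fig.show()
```
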
19 | ## 6. Fraud Analysis (Google Gemini)
20 | This project was built for my Google Gen AI Hackathon (Hack2Skill) entry. The user uploads their fraud CSV dataset and selects the target variable and the columns (attributes) to analyze. The app then produces 4 visualizations: Countplot, Histoplot, Multiclass Countplot, and Multiclass Histoplot, and Google Gemini gives a conclusion and actionable insights for each visualization.
21 |
22 | ## 7. PDF Document Analysis (Google Gemini)
23 | This project lets the user analyze a PDF file. The user uploads the PDF file and adds optional extra stopwords for data cleansing. The system then shows Wordcloud and Bi-Gram visualizations, and Google Gemini gives a conclusion and actionable insights based on each visualization.
24 |
25 | ## 8. Table Scraper Analysis (Google Gemini, BeautifulSoup)
26 | This project lets the user analyze tables from a selected website link. The user pastes the link to analyze, and the system shows all of the tables available on that page. The user then selects the columns for analysis and removes unwanted rows, and finally Google Gemini analyzes the selected table and gives a conclusion and actionable insights (a minimal sketch of the scraping step is shown below).
27 |
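A minimal sketch of the table-scraping step, shown here with `pandas.read_html` as a simplified stand-in for the BeautifulSoup flow described above; the URL and column names are placeholders:

```python
import pandas as pd

url = "https://example.com/some-page-with-tables"   # placeholder link
tables = pd.read_html(url)                           # list of all tables found on the page (needs lxml installed)

print(f"Found {len(tables)} tables")
df = tables[0]                                       # the user picks one of the detected tables

# Keep only the user-selected columns and drop unwanted rows
df = df[["Column A", "Column B"]].dropna()           # placeholder column names
print(df.head())
```
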
28 |
29 | ## 9. PDF Document Comparison (Google Gemini, Cosine Similarity)
30 | This project lets the user compare 2 PDF documents; the system reports a similarity percentage computed with cosine similarity (a minimal sketch is shown below). It also shows Wordcloud and Bi-Gram visualizations for each document, and finally Google Gemini analyzes both documents and summarizes their similarities and differences.
31 |
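A minimal sketch of the similarity computation, assuming the two PDFs have already been extracted to plain-text strings (e.g. with PyPDF2); the texts below are placeholders:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

text1 = "extracted text of the first PDF ..."    # placeholder extracted text
text2 = "extracted text of the second PDF ..."   # placeholder extracted text

# Vectorize both documents with TF-IDF and compare them with cosine similarity
vectorizer = TfidfVectorizer(stop_words="english")
tfidf = vectorizer.fit_transform([text1, text2])
similarity = cosine_similarity(tfidf[0], tfidf[1])[0][0]

print(f"Similarity: {similarity * 100:.2f}%")
```
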
32 |
33 | ## 10. CT Scan and MRI Diagnosis Explainer
34 | This web application helps doctors and medical officers analyze a patient's CT scan or MRI image and points out potential abnormalities.
35 |
36 | ## 11. LLM Pandas AI and Google Gemini Analysis
37 | This web application analyzes your CSV dataset and lets the user ask anything about it. PandasAI answers the question (the answer can be a dataframe or a visualization), and Google Gemini adds an explanation when the answer is a visualization.
38 |
39 | ## 12. PDF Documents Comparer
40 | This web application analyzes your PDF files with Langchain and Google Gemini. The user uploads 2 PDF files and can ask any question about both files; Google Gemini then analyzes the documents to answer.
41 |
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/__pycache__/flask.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Streamlit-Web-Application-main/__pycache__/flask.cpython-311.pyc
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/__pycache__/pandasai.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Streamlit-Web-Application-main/__pycache__/pandasai.cpython-311.pyc
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/auto_sentiment_analysis_twitter.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import seaborn as sns
3 | import matplotlib.pyplot as plt
4 | sns.set_theme(color_codes=True)
5 | import os
6 | import pathlib
7 | import textwrap
8 | import google.generativeai as genai
9 | from IPython.display import display
10 | from IPython.display import Markdown
11 | import PIL.Image
12 |
13 | st.title("Sentiment Analysis")
14 |
15 | from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
16 | from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
17 | from wordcloud import WordCloud
18 | import PyPDF2
19 | import re
20 | from io import StringIO
21 | import plotly.express as px
22 | import pandas as pd
23 | import collections
24 | from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
25 |
26 | # Create stemmer
27 | factory = StemmerFactory()
28 | stemmer = factory.create_stemmer()
29 |
30 | # Create stopword remover
31 | stop_factory = StopWordRemoverFactory()
32 | more_stopword = ['dengan', 'ia', 'bahwa', 'oleh', 'rp', 'undang', 'pasal', 'ayat', 'bab']
33 | data = stop_factory.get_stop_words() + more_stopword
34 |
35 | # Upload the CSV file
36 | uploaded_file = st.file_uploader("Upload CSV file:")
37 |
38 | # User input for delimiter
39 | delimiter_option = st.radio("Select CSV delimiter:", [",", ";"], index=0)
40 |
41 | # Add custom stopwords
42 | custom_stopwords = st.text_input("Enter custom stopwords (comma-separated):")
43 | custom_stopword_list = [word.strip() for word in custom_stopwords.split(",")] if custom_stopwords else []
44 |
45 | # Check if the file is uploaded
46 | if uploaded_file is not None:
47 | # Read the CSV file into a Pandas DataFrame
48 | if delimiter_option == ",":
49 | df = pd.read_csv(uploaded_file, delimiter=",")
50 | elif delimiter_option == ";":
51 | df = pd.read_csv(uploaded_file, delimiter=";")
52 | else:
53 | st.error("Invalid delimiter option.")
54 |
55 | # Show the DataFrame
56 | st.dataframe(df)
57 |
58 | # Select a column for sentiment analysis
59 | object_columns = df.select_dtypes(include="object").columns
60 | target_variable = st.selectbox("Choose a column for Sentiment Analysis:", object_columns)
61 |
62 | # Perform sentiment analysis on the selected column
63 | if st.button("Perform Sentiment Analysis"):
64 | # Your sentiment analysis logic goes here
65 | st.success(f"Sentiment Analysis performed on column: {target_variable}")
66 |
67 | # Show the selected column
68 | st.write(f"Selected {target_variable} Column:")
69 | st.dataframe(df[[target_variable]])
70 |
71 | # Create a new DataFrame with cleaned text column
72 | new_df = df.copy()
73 |
74 | # Create cleaned text column (updated to include custom stopwords)
75 | custom_stopword_list = [word.strip() for word in custom_stopwords.split(",")] if custom_stopwords else []
76 | new_df['cleaned_text'] = new_df[target_variable].apply(lambda x: ' '.join(
77 | [stemmer.stem(word) for word in stop_factory.create_stop_word_remover().remove(x).split()
78 | if word.lower() not in data and word.lower() not in custom_stopword_list] # Exclude custom stopwords
79 | ))
80 |
81 | # Note: stemming and stopword removal (including custom stopwords) is already applied above, so the column is not recomputed here
83 |
84 | # Show the cleaned text column
85 | #st.write("Cleaned Text Column:")
86 | #st.dataframe(new_df[['cleaned_text']])
87 |
88 | # Load the sentiment analysis pipeline
89 | pretrained = "indonesia-bert-sentiment-classification"
90 | model = AutoModelForSequenceClassification.from_pretrained(pretrained)
91 | tokenizer = AutoTokenizer.from_pretrained(pretrained)
92 | sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
93 | label_index = {'LABEL_0': 'positive', 'LABEL_1': 'neutral', 'LABEL_2': 'negative'}
94 |
95 | # Function to apply sentiment analysis to each row in the 'cleaned_text' column
96 | def analyze_sentiment(text):
97 | result = sentiment_analysis(text)
98 | label = label_index[result[0]['label']]
99 | score = result[0]['score']
100 | return pd.Series({'sentiment_label': label, 'sentiment_score': score})
101 |
102 | # Apply sentiment analysis to 'cleaned_text' column
103 | new_df[['sentiment_label', 'sentiment_score']] = new_df['cleaned_text'].apply(analyze_sentiment)
104 |
105 | # Display the results
106 | st.write("Sentiment Analysis Results:")
107 | st.dataframe(new_df[['cleaned_text', 'sentiment_label', 'sentiment_score']])
108 |
109 | # Count the occurrences of each sentiment label
110 | sentiment_counts = new_df['sentiment_label'].value_counts()
111 |
112 | # Plot a bar chart using seaborn
113 | st.set_option('deprecation.showPyplotGlobalUse', False)
114 | sns.set(style="whitegrid")
115 | plt.figure(figsize=(8, 6))
116 | sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")
117 | plt.title('Sentiment Distribution')
118 | plt.xlabel('Sentiment Label')
119 | plt.ylabel('Count')
120 | st.pyplot()
121 |
122 | # Define a dictionary to store sentiment-wise text
123 | sentiment_text = {
124 | "positive": "",
125 | "neutral": "",
126 | "negative": ""
127 | }
128 |
129 | # Loop through each sentiment label
130 | for label in sentiment_counts.index:
131 | # Filter data for the current sentiment
132 | selected_data = new_df[new_df['sentiment_label'] == label]
133 |
134 | # Remove general and custom stopwords from the cleaned text before concatenation
135 | selected_data['cleaned_text'] = selected_data['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in data and word.lower() not in custom_stopword_list])) # Remove only general stopwords
136 |
137 | # Concatenate cleaned text from the selected data
138 | sentiment_text[label] = ' '.join(selected_data['cleaned_text'].astype(str))
139 |
140 |
141 | # Define variables for sentiment-wise text (adjust variable names)
142 | #positive_text = ""
143 | #neutral_text = ""
144 | #negative_text = ""
145 |
146 |
147 | # Concatenate cleaned text for each sentiment
148 | positive_text = ' '.join([word for word in new_df[new_df['sentiment_label'] == 'positive']['cleaned_text'].apply(lambda x: ' '.join([w for w in x.split() if w.lower() not in data and w.lower() not in custom_stopword_list]))])
149 | neutral_text = ' '.join([word for word in new_df[new_df['sentiment_label'] == 'neutral']['cleaned_text'].apply(lambda x: ' '.join([w for w in x.split() if w.lower() not in data and w.lower() not in custom_stopword_list]))])
150 | negative_text = ' '.join([word for word in new_df[new_df['sentiment_label'] == 'negative']['cleaned_text'].apply(lambda x: ' '.join([w for w in x.split() if w.lower() not in data and w.lower() not in custom_stopword_list]))])
151 |
152 |
153 |
154 | # Generate WordCloud for positive sentiment
155 | positive_wordcloud = WordCloud(
156 | min_font_size=3, max_words=200, width=800, height=400,
157 | colormap='viridis', background_color='white'
158 | ).generate(positive_text)
159 |
160 | # Save the WordCloud image with a filename
161 | positive_wordcloud_filename = "wordcloud_positive.png"
162 | positive_wordcloud.to_file(positive_wordcloud_filename)
163 |
164 | # Display the saved WordCloud image using Streamlit
165 | st.subheader("WordCloud for Positive Sentiment")
166 | st.image(positive_wordcloud_filename)
167 |
168 | def to_markdown(text):
169 | text = text.replace('•', ' *')
170 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
171 |
172 | genai.configure(api_key="AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA")
173 |
174 | import PIL.Image
175 |
176 | img = PIL.Image.open("wordcloud_positive.png")
177 | model = genai.GenerativeModel('gemini-pro-vision')
178 | response = model.generate_content(img)
179 |
180 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image about wordcloud positive sentiment", img])
181 | response.resolve()
182 | st.write("**Google Gemini Response About Data**")
183 | st.write(response.text)
184 |
185 |
186 | # Generate WordCloud for neutral sentiment
187 | neutral_wordcloud = WordCloud(
188 | min_font_size=3, max_words=200, width=800, height=400,
189 | colormap='viridis', background_color='white'
190 | ).generate(neutral_text)
191 |
192 | # Save the WordCloud image with a filename
193 | neutral_wordcloud_filename = "wordcloud_neutral.png"
194 | neutral_wordcloud.to_file(neutral_wordcloud_filename)
195 |
196 | # Display the saved WordCloud image using Streamlit
197 | st.subheader("WordCloud for Neutral Sentiment")
198 | st.image(neutral_wordcloud_filename)
199 |
200 | def to_markdown(text):
201 | text = text.replace('•', ' *')
202 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
203 |
204 | genai.configure(api_key="AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA")
205 |
206 | import PIL.Image
207 |
208 | img = PIL.Image.open("wordcloud_neutral.png")
209 | model = genai.GenerativeModel('gemini-pro-vision')
210 | response = model.generate_content(img)
211 |
212 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image about wordcloud neutral sentiment", img])
213 | response.resolve()
214 | st.write("**Google Gemini Response About Data**")
215 | st.write(response.text)
216 |
217 |
218 |
219 | # Generate WordCloud for negative sentiment
220 | negative_wordcloud = WordCloud(
221 | min_font_size=3, max_words=200, width=800, height=400,
222 | colormap='viridis', background_color='white'
223 | ).generate(negative_text)
224 |
225 | # Save the WordCloud image with a filename
226 | negative_wordcloud_filename = "wordcloud_negative.png"
227 | negative_wordcloud.to_file(negative_wordcloud_filename)
228 |
229 | # Display the saved WordCloud image using Streamlit
230 | st.subheader("WordCloud for Negative Sentiment")
231 | st.image(negative_wordcloud_filename)
232 |
233 | def to_markdown(text):
234 | text = text.replace('•', ' *')
235 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
236 |
237 | genai.configure(api_key="AIzaSyB2sQh_oHbFULJ7x2vixJWAboPpPvrCKoA")
238 |
239 | import PIL.Image
240 |
241 | img = PIL.Image.open("wordcloud_negative.png")
242 | model = genai.GenerativeModel('gemini-pro-vision')
243 | response = model.generate_content(img)
244 |
245 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image about wordcloud negative sentiment", img])
246 | response.resolve()
247 | st.write("**Google Gemini Response About Data**")
248 | st.write(response.text)
249 |
250 |
251 | # Bigrams Positive Sentiment
252 | words1 = positive_text.split()
253 | # Get bigrams
254 | bigrams = list(zip(words1, words1[1:]))
255 |
256 | # Count bigrams
257 | bigram_counts = collections.Counter(bigrams)
258 |
259 | # Get top 10 bigram counts
260 | top_bigrams = dict(bigram_counts.most_common(10))
261 |
262 | # Create bar chart
263 | plt.figure(figsize=(10, 7))
264 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center')
265 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90)
266 | plt.xlabel('Bigram Words')
267 | plt.ylabel('Count')
268 | plt.title(f"Top 10 Bigram for Positive Sentiment")
269 | # Save the entire plot as a PNG
270 | plt.tight_layout()
271 | plt.savefig("bigram_positive.png")
272 | st.subheader("Bigram for Positive Sentiment")
273 | st.image("bigram_positive.png")
274 |
275 | def to_markdown(text):
276 | text = text.replace('•', ' *')
277 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
278 |
279 |     genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
280 |
281 | import PIL.Image
282 |
283 | img = PIL.Image.open("bigram_positive.png")
284 | model = genai.GenerativeModel('gemini-pro-vision')
285 | response = model.generate_content(img)
286 |
287 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image about bigram positive sentiment", img])
288 | response.resolve()
289 | st.write("**Google Gemini Response About Data**")
290 | st.write(response.text)
291 |
292 |
293 |
294 | # Bigrams Neutral Sentiment
295 | words1 = neutral_text.split()
296 | # Get bigrams
297 | bigrams = list(zip(words1, words1[1:]))
298 |
299 | # Count bigrams
300 | bigram_counts = collections.Counter(bigrams)
301 |
302 | # Get top 10 bigram counts
303 | top_bigrams = dict(bigram_counts.most_common(10))
304 |
305 | # Create bar chart
306 | plt.figure(figsize=(10, 7))
307 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center')
308 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90)
309 | plt.xlabel('Bigram Words')
310 | plt.ylabel('Count')
311 | plt.title(f"Top 10 Bigram for Neutral Sentiment")
312 | # Save the entire plot as a PNG
313 | plt.tight_layout()
314 | plt.savefig("bigram_neutral.png")
315 | st.subheader("Bigram for Neutral Sentiment")
316 | st.image("bigram_neutral.png")
317 |
318 | def to_markdown(text):
319 | text = text.replace('•', ' *')
320 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
321 |
322 |     genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
323 |
324 | import PIL.Image
325 |
326 | img = PIL.Image.open("bigram_neutral.png")
327 | model = genai.GenerativeModel('gemini-pro-vision')
328 | response = model.generate_content(img)
329 |
330 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image about bigram neutral sentiment", img])
331 | response.resolve()
332 | st.write("**Google Gemini Response About Data**")
333 | st.write(response.text)
334 |
335 |
336 |
337 | # Bigrams Negative Sentiment
338 | words1 = negative_text.split()
339 | # Get bigrams
340 | bigrams = list(zip(words1, words1[1:]))
341 |
342 | # Count bigrams
343 | bigram_counts = collections.Counter(bigrams)
344 |
345 | # Get top 10 bigram counts
346 | top_bigrams = dict(bigram_counts.most_common(10))
347 |
348 | # Create bar chart
349 | plt.figure(figsize=(10, 7))
350 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center')
351 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90)
352 | plt.xlabel('Bigram Words')
353 | plt.ylabel('Count')
354 |     plt.title(f"Top 10 Bigram for Negative Sentiment")
355 | # Save the entire plot as a PNG
356 | plt.tight_layout()
357 | plt.savefig("bigram_negative.png")
358 | st.subheader("Bigram for Negative Sentiment")
359 | st.image("bigram_negative.png")
360 |
361 | def to_markdown(text):
362 | text = text.replace('•', ' *')
363 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
364 |
365 |     genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
366 |
367 | import PIL.Image
368 |
369 | img = PIL.Image.open("bigram_negative.png")
370 | model = genai.GenerativeModel('gemini-pro-vision')
371 | response = model.generate_content(img)
372 |
373 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image about bigram negative sentiment", img])
374 | response.resolve()
375 | st.write("**Google Gemini Response About Data**")
376 | st.write(response.text)
377 |
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/chat_with_your_csv.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import random
3 | import time
4 | import pandas as pd
5 | import seaborn as sns
6 | import matplotlib.pyplot as plt
7 | import numpy as np
8 | from scipy import stats
9 | import warnings
10 | sns.set_theme(color_codes=True)
11 | import os
12 | import pathlib
13 | import textwrap
14 | import google.generativeai as genai
15 | from IPython.display import display
16 | from IPython.display import Markdown
17 | import time
18 |
19 |
20 | st.title("EDA with Google Gemini")
21 |
22 | # Upload the CSV file
23 | uploaded_file = st.file_uploader("Upload CSV file:")
24 |
25 | # Check if the file is uploaded
26 | if uploaded_file is not None:
27 | # Read the CSV file into a Pandas DataFrame
28 | df = pd.read_csv(uploaded_file)
29 |
30 | # Show the original DataFrame
31 | st.write("Original DataFrame:")
32 | st.dataframe(df)
33 |
34 |
35 |
36 | st.write("**Countplot Barchart**")
37 |
38 |     # Get the names of all categorical (object) columns with between 2 and 10 unique values
39 | cat_vars = [col for col in df.select_dtypes(include='object').columns if df[col].nunique() > 1 and df[col].nunique() <= 10]
40 |
41 | # Create a figure with subplots
42 | num_cols = len(cat_vars)
43 | num_rows = (num_cols + 2) // 3
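    |     # (num_cols + 2) // 3 is a ceiling division, so there is always a full row of 3 subplot slots for every variable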
44 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
45 | axs = axs.flatten()
46 |
47 | # Create a countplot for the top 10 values of each categorical variable using Seaborn
48 | for i, var in enumerate(cat_vars):
49 | top_values = df[var].value_counts().head(10).index
50 | filtered_df = df.copy()
51 | filtered_df[var] = df[var].apply(lambda x: x if x in top_values else 'Other')
52 | sns.countplot(x=var, data=filtered_df, ax=axs[i])
53 | axs[i].set_title(var)
54 | axs[i].tick_params(axis='x', rotation=90)
55 |
56 | # Remove any extra empty subplots if needed
57 | if num_cols < len(axs):
58 | for i in range(num_cols, len(axs)):
59 | fig.delaxes(axs[i])
60 |
61 | # Adjust spacing between subplots
62 | fig.tight_layout()
63 |
64 | # Show plots using Streamlit
65 | st.pyplot(fig)
66 | fig.savefig("plot4.png")
67 |
68 |
69 |
70 | st.write("**Histoplot**")
71 | # Get the names of all columns with data type 'int' or 'float'
72 | num_vars = [col for col in df.select_dtypes(include=['int', 'float']).columns]
73 |
74 | # Create a figure with subplots
75 | num_cols = len(num_vars)
76 | num_rows = (num_cols + 2) // 3
77 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
78 | axs = axs.flatten()
79 |
80 | # Create a histplot for each numeric variable using Seaborn
81 | for i, var in enumerate(num_vars):
82 | sns.histplot(df[var], ax=axs[i], kde=True)
83 | axs[i].set_title(var)
84 | axs[i].set_xlabel('')
85 |
86 | # Remove any extra empty subplots if needed
87 | if num_cols < len(axs):
88 | for i in range(num_cols, len(axs)):
89 | fig.delaxes(axs[i])
90 |
91 | # Adjust spacing between subplots
92 | fig.tight_layout()
93 |
94 | # Show plots using Streamlit
95 | st.pyplot(fig)
96 | fig.savefig("plot7.png")
97 |
98 |
99 |
100 | # Select target variable
101 | target_variable = st.selectbox("Select target variable:", df.columns)
102 |
103 | # Select columns for analysis
104 | columns_for_analysis = st.multiselect("Select columns for analysis:", [col for col in df.columns if col != target_variable])
105 |
106 | # Process button
107 | if st.button("Process"):
108 | # Select the target variable and columns for analysis from the original DataFrame
109 | target_variable_data = df[target_variable]
110 | columns_for_analysis_data = df[columns_for_analysis]
111 |
112 | # Display target variable in a dataframe
113 | target_variable_df = df[[target_variable]]
114 | st.write("Target Variable DataFrame:")
115 | st.dataframe(target_variable_df)
116 |
117 | # Display columns for analysis in a dataframe
118 | columns_for_analysis_df = df[columns_for_analysis]
119 | st.write("Columns for Analysis DataFrame:")
120 | st.dataframe(columns_for_analysis_df)
121 |
122 | # Concatenate target variable and columns for analysis into a single DataFrame
123 | df = pd.concat([target_variable_data, columns_for_analysis_data], axis=1)
124 | st.write("Columns for Analysis and Target Variable DataFrame:")
125 | st.dataframe(df)
126 |
127 | # Drop columns with null values more than 25%
128 | null_percentage = df.isnull().sum() / len(df)
129 | columns_to_drop = null_percentage[null_percentage > 0.25].index
130 | df.drop(columns=columns_to_drop, inplace=True)
131 |
132 | # Fill missing values below 25% with median
133 | for col in df.columns:
134 | if df[col].isnull().sum() > 0: # Check if there are missing values
135 | if null_percentage[col] <= 0.25:
136 | if df[col].dtype in ['float64', 'int64']: # Check if missing values are below 25%
137 | median_value = df[col].median() # Calculate median for the column
138 | df[col].fillna(median_value, inplace=True)
139 |
140 | # Convert object datatype columns to lowercase
141 | for col in df.columns:
142 | if df[col].dtype == 'object': # Check if datatype is object
143 | df[col] = df[col].str.lower() # Convert values to lowercase
144 |
145 | st.write("Cleaned Dataset")
146 | st.dataframe(df)
147 |
148 |
149 | st.write("**Multiclass Barplot**")
150 | # Get the names of all columns with data type 'object' (categorical columns)
151 | cat_cols = df.columns.tolist()
152 |
153 | # Get the names of all columns with data type 'object' (categorical variables)
154 | cat_vars = df.select_dtypes(include=['object']).columns.tolist()
155 |
156 |         # Exclude the target variable from the list if it exists in cat_vars
157 | if target_variable in cat_vars:
158 | cat_vars.remove(target_variable)
159 |
160 | # Create a figure with subplots, but only include the required number of subplots
161 | num_cols = len(cat_vars)
162 | num_rows = (num_cols + 2) // 3 # To make sure there are enough rows for the subplots
163 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
164 | axs = axs.flatten()
165 |
166 | # Create a count plot for each categorical variable
167 | for i, var in enumerate(cat_vars):
168 | top_categories = df[var].value_counts().nlargest(10).index
169 | filtered_df = df[df[var].notnull() & df[var].isin(top_categories)] # Exclude rows with NaN values in the variable
170 | sns.countplot(x=var, hue=target_variable, data=filtered_df, ax=axs[i])
171 | axs[i].set_xticklabels(axs[i].get_xticklabels(), rotation=90)
172 |
173 | # Remove any remaining blank subplots
174 | for i in range(num_cols, len(axs)):
175 | fig.delaxes(axs[i])
176 |
177 | # Adjust spacing between subplots
178 | fig.tight_layout()
179 |
180 | # Show plot
181 | st.pyplot(fig)
182 | fig.savefig("plot2.png")
183 |
184 |
185 |
186 |
187 | st.write("**Multiclass Histplot**")
188 | # Get the names of all columns with data type 'object' (categorical columns)
189 | cat_cols = df.columns.tolist()
190 |
191 | # Get the names of all columns with data type 'int'
192 | int_vars = df.select_dtypes(include=['int', 'float']).columns.tolist()
193 | int_vars = [col for col in int_vars if col != target_variable]
194 |
195 | # Create a figure with subplots
196 | num_cols = len(int_vars)
197 | num_rows = (num_cols + 2) // 3 # To make sure there are enough rows for the subplots
198 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
199 | axs = axs.flatten()
200 |
201 |         # Create a histogram for each numeric variable with the target variable as hue
202 | for i, var in enumerate(int_vars):
203 | top_categories = df[var].value_counts().nlargest(10).index
204 | filtered_df = df[df[var].notnull() & df[var].isin(top_categories)]
205 | sns.histplot(data=df, x=var, hue=target_variable, kde=True, ax=axs[i])
206 | axs[i].set_title(var)
207 |
208 | # Remove any extra empty subplots if needed
209 | if num_cols < len(axs):
210 | for i in range(num_cols, len(axs)):
211 | fig.delaxes(axs[i])
212 |
213 | # Adjust spacing between subplots
214 | fig.tight_layout()
215 |
216 | # Show plot
217 | st.pyplot(fig)
218 | fig.savefig("plot3.png")
219 |
220 |
221 | # Define the paths to the saved plots
222 | plot_paths = ["plot4.png", "plot7.png", "plot2.png", "plot3.png"]
223 |
224 | # Create a new figure
225 | fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(20, 15))
226 |
227 | # Iterate over each plot path and place it in the corresponding subplot
228 | for i, plot_path in enumerate(plot_paths):
229 | row = i // 2
230 | col = i % 2
231 | img = plt.imread(plot_path)
232 | axs[row, col].imshow(img)
233 | axs[row, col].axis('off')
234 |
235 | # Adjust spacing between subplots
236 | plt.tight_layout()
237 |
238 | # Save the merged plot
239 | fig.savefig("merged_plots.png")
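    | # merged_plots.png combines the four saved EDA charts; it is the image handed to Gemini in the chat below, so every answer is grounded in these plots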
240 |
241 | # Streamed response emulator
242 |
243 | def to_markdown(text):
244 | text = text.replace('•', ' *')
245 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
246 |
247 | genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
248 |
249 | import PIL.Image
250 |
251 | img = PIL.Image.open("merged_plots.png")
252 | model = genai.GenerativeModel('gemini-pro-vision')
253 | response = model.generate_content(img)
254 |
255 | def response_generator(response_text):
256 |     # Yield the reply word by word to emulate a streamed response
257 | 
258 | 
259 |     for word in response_text.split():
260 |         yield word + " "
261 |         time.sleep(0.05)
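    | # Note: this generator is not called anywhere yet; with a recent Streamlit release it could feed st.write_stream to give the chat a typing effect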
262 |
263 |
264 | st.title("Chat with your Data")
265 |
266 | # Initialize chat history
267 | if "messages" not in st.session_state:
268 | st.session_state.messages = []
269 |
270 | # Display chat messages from history on app rerun
271 | for message in st.session_state.messages:
272 | with st.chat_message(message["role"]):
273 | st.markdown(message["content"])
274 |
275 | # Accept user input
276 | if prompt := st.chat_input("Ask Your Data"):
277 | # Add user message to chat history
278 | st.session_state.messages.append({"role": "user", "content": prompt})
279 | # Display user message in chat message container
280 | with st.chat_message("user"):
281 | st.markdown(prompt)
282 |
283 | # Generate Google Gemini response based on user's question
284 | img = PIL.Image.open("merged_plots.png")
285 | model = genai.GenerativeModel('gemini-pro-vision')
286 | response = model.generate_content([prompt, img], stream=True)
287 | response.resolve()
288 |
289 | # Format and display the response
290 | response_text = response.text
291 | response_markdown = to_markdown(response_text)
292 | st.write(response.text)
293 |
294 | # Add assistant response to chat history
295 | st.session_state.messages.append({"role": "assistant", "content": response_text})
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/cheatgpt.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from PIL import Image
3 | import io
4 | import textwrap
5 | import google.generativeai as genai
6 | from IPython.display import display
7 | from IPython.display import Markdown
8 |
9 | st.title("CheatGPT")
10 |
11 | uploaded_file = st.file_uploader("Upload your PNG or JPG image:", type=["png", "jpg"])
12 |
13 | if uploaded_file is not None:
14 |
15 | # Validate the file extension
16 | if uploaded_file.type in ["image/png", "image/jpeg"]:
17 | # Read the image bytes
18 | img_bytes = uploaded_file.read()
19 |
20 | # Convert bytes to PIL Image object
21 | img = Image.open(io.BytesIO(img_bytes))
22 | st.write("Image Uploaded")
23 | st.image(img)
24 |
25 | img.save("image.png")
26 |
27 | def to_markdown(text):
28 | text = text.replace('•', ' *')
29 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
30 |
31 |         genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
32 |
33 | import PIL.Image
34 |
35 | img1 = PIL.Image.open("image.png")
36 | model = genai.GenerativeModel('gemini-pro-vision')
37 | response = model.generate_content(img)
38 |
39 | response = model.generate_content(["Answer This Question and give the explanation", img1], stream=True)
40 | response.resolve()
41 | st.write("**Google Gemini Response About Data**")
42 | st.write(response.text)
43 |
44 |
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/compare.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import os
3 | import streamlit as st
4 | from langchain.chains import StuffDocumentsChain
5 | from langchain.chains.llm import LLMChain
6 | from langchain.prompts import PromptTemplate
7 | from langchain_community.document_loaders import PyPDFLoader
8 | from langchain_google_genai import ChatGoogleGenerativeAI
9 |
10 | # Title of the app
11 | st.title("PDF Document Comparer Analysis")
12 |
13 | # Upload the PDF files
14 | uploaded_file1 = st.file_uploader("Upload First PDF file:", type='pdf')
15 | uploaded_file2 = st.file_uploader("Upload Second PDF file:", type='pdf')
16 | question = st.text_input("Insert Question", "Put your question here about both documents")
17 |
18 | async def process_files():
19 | if uploaded_file1 and uploaded_file2 and question:
20 | # Save the uploaded files as file1.pdf and file2.pdf
21 | file1_path = "file1.pdf"
22 | file2_path = "file2.pdf"
23 | with open(file1_path, "wb") as f1:
24 | f1.write(uploaded_file1.getbuffer())
25 | with open(file2_path, "wb") as f2:
26 | f2.write(uploaded_file2.getbuffer())
27 |
28 | # Initialize the LLM with the Google API key
29 |         llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", google_api_key="YOUR_GOOGLE_GEMINI_API")
30 |
31 | # Load the PDF files
32 | loader1 = PyPDFLoader(file1_path)
33 | loader2 = PyPDFLoader(file2_path)
34 | docs1 = loader1.load()
35 | docs2 = loader2.load()
36 | docs3 = docs1 + docs2
37 |
38 | # Define the Summarize Chain
39 | template = """Write a concise summary of the following:
40 | "{text}"
41 | CONCISE SUMMARY:"""
42 | prompt = PromptTemplate.from_template(template)
43 | llm_chain = LLMChain(llm=llm, prompt=prompt)
44 | stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
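    |         # The "stuff" strategy concatenates every loaded page into a single prompt, so very long PDFs may exceed the model's context window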
45 |
46 | # Process both documents
47 | response1 = stuff_chain.invoke(docs1)
48 | response2 = stuff_chain.invoke(docs2)
49 |
50 | # Display the summaries
51 | st.markdown("### Summary of the First Document")
52 | st.write(response1["output_text"])
53 |
54 | st.markdown("### Summary of the Second Document")
55 | st.write(response2["output_text"])
56 |
57 | # Additional comparison logic can be added here based on the question
58 |         comparison_template = question + """\nWrite a concise summary of the following:
59 | "{text}"
60 | CONCISE SUMMARY:"""
61 |
62 | prompt1 = PromptTemplate.from_template(comparison_template)
63 | llm_chain1 = LLMChain(llm=llm, prompt=prompt1)
64 | stuff_chain1 = StuffDocumentsChain(llm_chain=llm_chain1, document_variable_name="text")
65 | response3 = stuff_chain1.invoke(docs3)
66 |
67 | # Display the comparison result
68 | st.markdown("### Comparison Result")
69 | st.write(response3["output_text"])
70 |
71 |         # Clean up the temporary files
72 |         os.remove(file1_path)
73 |         os.remove(file2_path)
74 |
75 | if st.button("Process"):
76 | asyncio.run(process_files())
77 |
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/diagnosis.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import datetime
3 | import os
4 | import PIL.Image
5 | import google.generativeai as genai
6 | from IPython.display import Markdown
7 | import time
8 | import io
9 | from PIL import Image
10 | import textwrap
11 |
12 | # Replace with your GenerativeAI API key
13 | genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
14 |
15 | st.title("CT Scan and MRI Diagnosis Explanator")
16 |
17 | # Initialize chat history
18 | if "messages" not in st.session_state:
19 | st.session_state.messages = []
20 |
21 | # Display chat messages from history on app rerun
22 | for message in st.session_state.messages:
23 | with st.chat_message(message["role"]):
24 | st.markdown(message["content"])
25 |
26 | # Upload an image file
27 | uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png"])
28 |
29 | if uploaded_file is not None:
30 | if uploaded_file.type in ["image/png", "image/jpeg"]:
31 | img_bytes = uploaded_file.read()
32 | img = Image.open(io.BytesIO(img_bytes))
33 | st.write("Image Uploaded")
34 | st.image(img)
35 |
36 | img.save("image.png")
37 |
38 | def to_markdown(text): # Consider removing if formatting not needed
39 | text = text.replace('•', ' *')
40 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
41 |
42 | model = genai.GenerativeModel('gemini-pro-vision') # Check supported models
43 | response = model.generate_content(["Can you analyze this CT scan or MRI and explain any potential abnormalities?", img], stream=True)
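    |         # resolve() consumes the streamed chunks so that the full reply is available on the response object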
44 | response.resolve()
45 |
46 | st.write("**Google Gemini Response About the image**")
47 |
48 |
49 | # Extract text from all candidates (GitHub solution)
50 | text_parts = []
51 | for candidate in response.candidates:
52 | text_parts.extend([part.text for part in candidate.content.parts])
53 | full_text = ''.join(text_parts) # Join text parts for a cohesive response
54 |
55 | st.write(full_text) # Display the combined text
56 |
57 |
58 |
59 |
60 |
61 |
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/ecommerce_clustering_llm.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import seaborn as sns
4 | import matplotlib.pyplot as plt
5 | from sklearn.cluster import KMeans
6 | from sklearn.metrics import silhouette_score
7 | from mpl_toolkits.mplot3d import Axes3D
8 | sns.set_theme(color_codes=True)
9 | import os
10 | import pathlib
11 | import textwrap
12 | import google.generativeai as genai
13 | from IPython.display import display
14 | from IPython.display import Markdown
15 | import streamlit as st
16 |
17 | st.title("Ecommerce Segmentation Analysis")
18 |
19 | # Upload the CSV file
20 | uploaded_file = st.file_uploader("Upload CSV file:")
21 |
22 | # Check if the file is uploaded
23 | if uploaded_file is not None:
24 | # Read the CSV file into a Pandas DataFrame
25 | df = pd.read_csv(uploaded_file)
26 |
27 | # Show the DataFrame
28 | st.dataframe(df)
29 |
30 | # Get numeric columns for clustering
31 | numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
32 | clustering_columns = st.multiselect("Select numeric columns for clustering:", numeric_columns)
33 |
34 |     # Require exactly 3 numeric columns (needed for the 3D scatter plot below)
35 | if len(clustering_columns) != 3:
36 | st.warning("Please select exactly 3 numeric columns for clustering.")
37 | else:
38 | # Display the selected columns
39 | st.subheader("Selected Columns for Clustering:")
40 | selected_data = df[clustering_columns]
41 | st.dataframe(selected_data)
42 |
43 |         # Drop rows with missing values (reassign instead of modifying a slice of df in place)
44 |         selected_data = selected_data.dropna()
45 |
46 | def visualize_clustering(df, selected_data):
47 | # Visualize the Elbow Method to find optimal clusters
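    |             # kmeans.inertia_ is the within-cluster sum of squares (WCSS); the "elbow" is where adding more clusters stops reducing it sharply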
48 | wcss = []
49 | for i in range(1, 11):
50 | kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
51 | kmeans.fit(selected_data)
52 | wcss.append(kmeans.inertia_)
53 |
54 | # Plot the Elbow Method
55 | st.subheader("Elbow Method to Determine Optimal Clusters")
56 | fig, ax = plt.subplots(figsize=(8, 5))
57 | ax.plot(range(1, 11), wcss, marker='o')
58 | ax.set_title('Elbow Method')
59 | ax.set_xlabel('Number of Clusters')
60 | ax.set_ylabel('WCSS') # Within-Cluster Sum of Squares
61 | st.pyplot(fig)
62 |
63 | # Visualize Silhouette Score for different cluster numbers
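    |             # Silhouette score ranges from -1 to 1; higher values mean points sit well inside their own cluster and far from the others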
64 | silhouette_scores = []
65 | for n_clusters in range(2, 11):
66 | kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
67 | kmeans.fit(selected_data)
68 | silhouette_avg = silhouette_score(selected_data, kmeans.labels_)
69 | silhouette_scores.append(silhouette_avg)
70 |
71 | # Plot Silhouette Score
72 | st.subheader("Silhouette Score for Different Cluster Numbers")
73 | fig, ax = plt.subplots(figsize=(8, 5))
74 | ax.plot(range(2, 11), silhouette_scores, marker='o')
75 | ax.set_title('Silhouette Score')
76 | ax.set_xlabel('Number of Clusters')
77 | ax.set_ylabel('Silhouette Score')
78 | st.pyplot(fig)
79 |
80 | # Apply KMeans clustering based on user-selected number of clusters
81 | num_clusters = st.slider("Select the number of clusters (2-10):", 2, 10, 3)
82 | kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
83 | cluster_labels = kmeans.fit_predict(selected_data)
84 |
85 | # Create a new DataFrame with the cluster labels
86 | clustered_df = pd.DataFrame(cluster_labels, columns=['cluster'], index=selected_data.index)
87 |
88 | # Concatenate the clustered_df with the original DataFrame
89 | df = pd.concat([df, clustered_df], axis=1)
90 | st.subheader("Clustered Dataset")
91 | st.dataframe(df)
92 |
93 | # Visualize clustering results in 3D plot
94 | fig = plt.figure(figsize=(10, 12))
95 | ax = fig.add_subplot(111, projection='3d')
96 | scatter = ax.scatter(selected_data[clustering_columns[0]],
97 | selected_data[clustering_columns[1]],
98 | selected_data[clustering_columns[2]],
99 | c=cluster_labels, cmap='viridis', s=50)
100 |
101 | ax.set_xlabel(clustering_columns[0])
102 | ax.set_ylabel(clustering_columns[1])
103 | ax.set_zlabel(clustering_columns[2])
104 | ax.set_title(f'3D Clustering (Cluster Amount = {num_clusters})')
105 |
106 | # Add a legend
107 | legend = ax.legend(*scatter.legend_elements(), title="Clusters")
108 | ax.add_artist(legend)
109 |
110 | # Show the 3D plot
111 | st.pyplot(fig)
112 | fig.savefig("plot8.png")
113 |
114 | # Visualize clustering
115 | visualize_clustering(df, selected_data)
116 |
117 |
118 | def to_markdown(text):
119 | text = text.replace('•', ' *')
120 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
121 |
122 |             genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
123 |
124 | import PIL.Image
125 |
126 | img = PIL.Image.open("plot8.png")
127 | model = genai.GenerativeModel('gemini-pro-vision')
128 | response = model.generate_content(img)
129 |
130 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the each cluster colour. write the conclusion in English", img], stream=True)
131 | response.resolve()
132 | st.subheader("**Google Gemini Response About Data**")
133 | st.write(response.text)
134 |
135 |
136 |
137 |
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/fraud_analysis_llm.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import seaborn as sns
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import streamlit as st
6 | sns.set_theme(color_codes=True)
7 | import textwrap
8 | import google.generativeai as genai
9 | from IPython.display import display
10 | from IPython.display import Markdown
11 |
12 | st.title("Fraud Analysis and Detection with Google Gen AI")
13 |
14 | # Upload the CSV file
15 | uploaded_file = st.file_uploader("Upload CSV file:")
16 |
17 | # Check if the file is uploaded
18 | if uploaded_file is not None:
19 | # Read the CSV file into a Pandas DataFrame
20 | df = pd.read_csv(uploaded_file)
21 |
22 | # Show the original DataFrame
23 | st.write("Original DataFrame:")
24 | st.dataframe(df)
25 |
26 | # Data Cleansing
27 |     for col in df.columns:
28 |         # Strip currency symbols from money-like text columns, then convert to float
29 |         if df[col].dtype == 'object' and any(key in col.lower() for key in ['value', 'price', 'cost', 'amount']):
30 |             df[col] = df[col].str.replace('$', '', regex=False)
31 |             df[col] = df[col].str.replace('£', '', regex=False)
32 |             df[col] = df[col].str.replace('€', '', regex=False)
33 |             df[col] = df[col].replace(r'[^\d.-]', '', regex=True).astype(float)
34 |
35 | # Drop columns with null values more than 25%
36 | null_percentage = df.isnull().sum() / len(df)
37 | columns_to_drop = null_percentage[null_percentage > 0.25].index
38 | df.drop(columns=columns_to_drop, inplace=True)
39 |
40 | # Fill missing values below 25% with median
41 | for col in df.columns:
42 | if df[col].isnull().sum() > 0: # Check if there are missing values
43 | if null_percentage[col] <= 0.25:
44 | if df[col].dtype in ['float64', 'int64']: # Check if missing values are below 25%
45 | median_value = df[col].median() # Calculate median for the column
46 | df[col].fillna(median_value, inplace=True)
47 |
48 | # Convert object datatype columns to lowercase
49 | for col in df.columns:
50 | if df[col].dtype == 'object': # Check if datatype is object
51 | df[col] = df[col].str.lower() # Convert values to lowercase
52 |
53 | st.write("Cleaned Dataset")
54 | st.dataframe(df)
55 |
56 |
57 |
58 | st.write("**Countplot Barchart**")
59 |
60 |     # Get the names of all categorical (object) columns with between 2 and 10 unique values
61 | cat_vars = [col for col in df.select_dtypes(include='object').columns if df[col].nunique() > 1 and df[col].nunique() <= 10]
62 |
63 | # Create a figure with subplots
64 | num_cols = len(cat_vars)
65 | num_rows = (num_cols + 2) // 3
66 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
67 | axs = axs.flatten()
68 |
69 | # Create a countplot for the top 10 values of each categorical variable using Seaborn
70 | for i, var in enumerate(cat_vars):
71 | top_values = df[var].value_counts().head(10).index
72 | filtered_df = df.copy()
73 | filtered_df[var] = df[var].apply(lambda x: x if x in top_values else 'Other')
74 | sns.countplot(x=var, data=filtered_df, ax=axs[i])
75 | axs[i].set_title(var)
76 | axs[i].tick_params(axis='x', rotation=90)
77 |
78 | # Remove any extra empty subplots if needed
79 | if num_cols < len(axs):
80 | for i in range(num_cols, len(axs)):
81 | fig.delaxes(axs[i])
82 |
83 | # Adjust spacing between subplots
84 | fig.tight_layout()
85 |
86 | # Show plots using Streamlit
87 | st.pyplot(fig)
88 | fig.savefig("plot4.png")
89 |
90 | def to_markdown(text):
91 | text = text.replace('•', ' *')
92 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
93 |
94 |     genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
95 |
96 | import PIL.Image
97 |
98 | img = PIL.Image.open("plot4.png")
99 | model = genai.GenerativeModel('gemini-pro-vision')
100 | response = model.generate_content(img)
101 |
102 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image", img], stream=True)
103 | response.resolve()
104 | st.write("**Google Gemini Response About Data**")
105 | st.write(response.text)
106 |
107 |
108 |
109 | # Get the names of all columns with data type 'int' or 'float'
110 | num_vars = [col for col in df.select_dtypes(include=['int', 'float']).columns]
111 |
112 | # Create a figure with subplots
113 | num_cols = len(num_vars)
114 | num_rows = (num_cols + 2) // 3
115 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
116 | axs = axs.flatten()
117 |
118 | # Create a histplot for each numeric variable using Seaborn
119 | for i, var in enumerate(num_vars):
120 | sns.histplot(df[var], ax=axs[i], kde=True)
121 | axs[i].set_title(var)
122 | axs[i].set_xlabel('')
123 |
124 | # Remove any extra empty subplots if needed
125 | if num_cols < len(axs):
126 | for i in range(num_cols, len(axs)):
127 | fig.delaxes(axs[i])
128 |
129 | # Adjust spacing between subplots
130 | fig.tight_layout()
131 |
132 | # Show plots using Streamlit
133 | st.pyplot(fig)
134 | fig.savefig("plot5.png")
135 |
136 | def to_markdown(text):
137 | text = text.replace('•', ' *')
138 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
139 |
140 |     genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
141 |
142 | img = PIL.Image.open("plot5.png")
143 | model = genai.GenerativeModel('gemini-pro-vision')
144 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image", img], stream=True)
145 | response.resolve()
146 | st.write("**Google Gemini Response About Data**")
147 | st.write(response.text)
148 |
149 |
150 | # Select target variable
151 | target_variable = st.selectbox("Select target variable:", df.columns)
152 |
153 | # Select columns for analysis
154 | columns_for_analysis = st.multiselect("Select columns for analysis:", [col for col in df.columns if col != target_variable])
155 |
156 | # Process button
157 | if st.button("Process"):
158 | # Select the target variable and columns for analysis from the original DataFrame
159 | target_variable_data = df[target_variable]
160 | columns_for_analysis_data = df[columns_for_analysis]
161 |
162 | # Display target variable in a dataframe
163 | target_variable_df = df[[target_variable]]
164 | st.write("Target Variable DataFrame:")
165 | st.dataframe(target_variable_df)
166 |
167 | # Display columns for analysis in a dataframe
168 | columns_for_analysis_df = df[columns_for_analysis]
169 | st.write("Columns for Analysis DataFrame:")
170 | st.dataframe(columns_for_analysis_df)
171 |
172 | # Concatenate target variable and columns for analysis into a single DataFrame
173 | df = pd.concat([target_variable_data, columns_for_analysis_data], axis=1)
174 |
175 | # Drop columns with null values more than 25%
176 | null_percentage = df.isnull().sum() / len(df)
177 | columns_to_drop = null_percentage[null_percentage > 0.25].index
178 | df.drop(columns=columns_to_drop, inplace=True)
179 |
180 | # Fill missing values below 25% with median
181 | for col in df.columns:
182 | if df[col].isnull().sum() > 0: # Check if there are missing values
183 | if null_percentage[col] <= 0.25:
184 | if df[col].dtype in ['float64', 'int64']: # Check if missing values are below 25%
185 | median_value = df[col].median() # Calculate median for the column
186 | df[col].fillna(median_value, inplace=True)
187 |
188 | # Convert object datatype columns to lowercase
189 | for col in df.columns:
190 | if df[col].dtype == 'object': # Check if datatype is object
191 | df[col] = df[col].str.lower() # Convert values to lowercase
192 |
193 | st.write("Cleaned Dataset")
194 | st.dataframe(df)
195 |
196 | st.write("**Multiclass Barplot**")
197 | # Get the names of all columns with data type 'object' (categorical columns)
198 | cat_cols = df.columns.tolist()
199 |
200 | # Get the names of all columns with data type 'object' (categorical variables)
201 | cat_vars = df.select_dtypes(include=['object']).columns.tolist()
202 |
203 |         # Exclude the target variable from the list if it exists in cat_vars
204 | if target_variable in cat_vars:
205 | cat_vars.remove(target_variable)
206 |
207 | # Create a figure with subplots, but only include the required number of subplots
208 | num_cols = len(cat_vars)
209 | num_rows = (num_cols + 2) // 3 # To make sure there are enough rows for the subplots
210 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
211 | axs = axs.flatten()
212 |
213 | # Create a count plot for each categorical variable
214 | for i, var in enumerate(cat_vars):
215 | top_categories = df[var].value_counts().nlargest(10).index
216 | filtered_df = df[df[var].notnull() & df[var].isin(top_categories)] # Exclude rows with NaN values in the variable
217 | sns.countplot(x=var, hue=target_variable, data=filtered_df, ax=axs[i])
218 | axs[i].set_xticklabels(axs[i].get_xticklabels(), rotation=90)
219 |
220 | # Remove any remaining blank subplots
221 | for i in range(num_cols, len(axs)):
222 | fig.delaxes(axs[i])
223 |
224 | # Adjust spacing between subplots
225 | fig.tight_layout()
226 |
227 | # Show plot
228 | st.pyplot(fig)
229 | fig.savefig("plot6.png")
230 |
231 | def to_markdown(text):
232 | text = text.replace('•', ' *')
233 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
234 |
235 |         genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
236 |
237 | import PIL.Image
238 |
239 | img = PIL.Image.open("plot6.png")
240 | model = genai.GenerativeModel('gemini-pro-vision')
241 | response = model.generate_content(img)
242 |
243 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image", img], stream=True)
244 | response.resolve()
245 | st.write("**Google Gemini Response About Data**")
246 | st.write(response.text)
247 |
248 |
249 | st.write("**Multiclass Histplot**")
250 | # Get the names of all columns with data type 'object' (categorical columns)
251 | cat_cols = df.columns.tolist()
252 |
253 | # Get the names of all columns with data type 'int'
254 | int_vars = df.select_dtypes(include=['int', 'float']).columns.tolist()
255 | int_vars = [col for col in int_vars if col != target_variable]
256 |
257 | # Create a figure with subplots
258 | num_cols = len(int_vars)
259 | num_rows = (num_cols + 2) // 3 # To make sure there are enough rows for the subplots
260 | fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
261 | axs = axs.flatten()
262 |
263 |         # Create a histogram for each numeric variable with the target variable as hue
264 | for i, var in enumerate(int_vars):
265 | top_categories = df[var].value_counts().nlargest(10).index
266 | filtered_df = df[df[var].notnull() & df[var].isin(top_categories)]
267 | sns.histplot(data=df, x=var, hue=target_variable, kde=True, ax=axs[i])
268 | axs[i].set_title(var)
269 |
270 | # Remove any extra empty subplots if needed
271 | if num_cols < len(axs):
272 | for i in range(num_cols, len(axs)):
273 | fig.delaxes(axs[i])
274 |
275 | # Adjust spacing between subplots
276 | fig.tight_layout()
277 |
278 | # Show plot
279 | st.pyplot(fig)
280 | fig.savefig("plot7.png")
281 |
282 | def to_markdown(text):
283 | text = text.replace('•', ' *')
284 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
285 |
286 |         genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
287 |
288 | import PIL.Image
289 |
290 | img = PIL.Image.open("plot7.png")
291 | model = genai.GenerativeModel('gemini-pro-vision')
292 | response = model.generate_content(img)
293 |
294 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image", img], stream=True)
295 | response.resolve()
296 | st.write("**Google Gemini Response About Data**")
297 | st.write(response.text)
298 |
299 |
300 |
301 |
302 |
303 |
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/indonesia-bert-sentiment-classification/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "indobenchmark/indobert-base-p1",
3 | "_num_labels": 5,
4 | "architectures": [
5 | "BertForSequenceClassification"
6 | ],
7 | "attention_probs_dropout_prob": 0.1,
8 | "classifier_dropout": null,
9 | "directionality": "bidi",
10 | "gradient_checkpointing": false,
11 | "hidden_act": "gelu",
12 | "hidden_dropout_prob": 0.1,
13 | "hidden_size": 768,
14 | "id2label": {
15 | "0": "LABEL_0",
16 | "1": "LABEL_1",
17 | "2": "LABEL_2"
18 | },
19 | "initializer_range": 0.02,
20 | "intermediate_size": 3072,
21 | "label2id": {
22 | "LABEL_0": 0,
23 | "LABEL_1": 1,
24 | "LABEL_2": 2
25 | },
26 | "layer_norm_eps": 1e-12,
27 | "max_position_embeddings": 512,
28 | "model_type": "bert",
29 | "num_attention_heads": 12,
30 | "num_hidden_layers": 12,
31 | "output_past": true,
32 | "pad_token_id": 0,
33 | "pooler_fc_size": 768,
34 | "pooler_num_attention_heads": 12,
35 | "pooler_num_fc_layers": 3,
36 | "pooler_size_per_head": 128,
37 | "pooler_type": "first_token_transform",
38 | "position_embedding_type": "absolute",
39 | "problem_type": "single_label_classification",
40 | "torch_dtype": "float32",
41 | "transformers_version": "4.10.2",
42 | "type_vocab_size": 2,
43 | "use_cache": true,
44 | "vocab_size": 50000
45 | }
46 |
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/llmpandas.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import streamlit as st
3 | from langchain_groq.chat_models import ChatGroq
4 | from pandasai import SmartDataframe
5 | import os
6 | from PIL import Image
7 | import textwrap
8 | import google.generativeai as genai
9 | from IPython.display import display
10 | from IPython.display import Markdown
11 | import io
12 | import matplotlib.pyplot as plt
13 |
14 | # Load language model
15 | llm = ChatGroq(
16 | model_name="mixtral-8x7b-32768",
17 | api_key="YOUR_GROQ_API")
18 |
19 | def main():
20 | st.title("Ask your CSV")
21 |
22 | # Allow user to upload CSV file
23 | uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
24 |
25 | if uploaded_file is not None:
26 | # Read uploaded CSV file into pandas DataFrame
27 | data = pd.read_csv(uploaded_file)
28 | st.dataframe(data)
29 |
30 | # Convert DataFrame into SmartDataFrame
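    |         # SmartDataframe.chat sends the question together with the dataframe to the LLM; the answer can be plain text
    |         # or, for chart requests, a path to a saved image file (handled by the isinstance/os.path.exists check below)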
31 | df = SmartDataframe(data, config={"llm": llm})
32 |
33 | # Add text box for user input
34 | question = st.text_input("Ask a question about the data:")
35 |
36 | if st.button("Ask"):
37 | if question:
38 | # Answer the user's question using the language model
39 | answer = df.chat(question)
40 |
41 | # Display the answer
42 | st.write("Answer:", answer)
43 |
44 | # Check if the answer is a visualization
45 | if isinstance(answer, str) and os.path.exists(answer):
46 | # Open the image file
47 | image = Image.open(answer)
48 | # Display the image
49 | st.image(image, caption="Visualization")
50 |
51 |                     # Reuse the chart file PandasAI saved (the path in `answer`) for the Gemini prompt below
52 | 
53 | 
54 |                     # Generate content using Google Gemini
55 |                     def to_markdown(text):
56 |                         text = text.replace('•', ' *')
57 |                         return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
58 | 
59 |                     genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
60 |                     model = genai.GenerativeModel('gemini-pro-vision')
61 | 
62 |                     img1 = Image.open(answer)
63 |                     response = model.generate_content(["You are a Professional Data Analyst, give a conclusion and actionable insight based on the visualization", img1], stream=True)
64 |                     response.resolve()
65 | 
66 |                     st.write("**Google Gemini Response About Data**")
67 |                     st.write(response.text)
68 | else:
69 | st.warning("No visualization found.")
70 | else:
71 | st.warning("Please ask a question.")
72 |
73 | if __name__ == "__main__":
74 | main()
75 |
76 |
77 |
78 |
79 |
80 |
81 |
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/pdf_comparer.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import seaborn as sns
3 | import matplotlib.pyplot as plt
4 | import PyPDF2
5 | sns.set_theme(color_codes=True)
6 | import pandas as pd
7 | from io import StringIO
8 | import re
9 | import os
10 | import pathlib
11 | import textwrap
12 | import google.generativeai as genai
13 | from IPython.display import display
14 | from IPython.display import Markdown
15 | # import StemmerFactory class
16 | from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
17 | # create stemmer
18 | factory = StemmerFactory()
19 | stemmer = factory.create_stemmer()
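    | # Sastrawi provides Indonesian-language stemming and stopword removal, so the PDFs compared here are assumed to be Indonesian documents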
20 |
21 | st.title("PDF Document Comparison")
22 |
23 | additional_stopwords = st.text_input("Enter additional stopwords (comma-separated)", value="")
24 | additional_stopwords = additional_stopwords.split(",")
25 |
26 | from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
27 | stop_factory = StopWordRemoverFactory()
28 | more_stopword = ['dengan', 'ia','bahwa','oleh','rp','undang','pasal','ayat','bab']
29 | data = stop_factory.get_stop_words()+more_stopword + additional_stopwords
30 | stopword = stop_factory.create_stop_word_remover()
31 |
32 | # Function to read PDF and return string
33 | def read_pdf(file):
34 | # Create a PyPDF2 reader object
35 | pdf_reader = PyPDF2.PdfFileReader(file)
36 |
37 | # Extract text from all pages of PDF
38 | text = ""
39 | for page in range(pdf_reader.getNumPages()):
40 | text += pdf_reader.getPage(page).extractText()
41 |
42 | # Return the text as a string
43 | return text
44 |
45 | # Upload PDF file
46 | file = st.file_uploader("Upload a PDF file", type="pdf", key='text1')
47 |
48 | # If file is uploaded
49 | if file is not None:
50 | # Call read_pdf function to convert PDF to string
51 | text1 = read_pdf(file)
52 |
53 |
54 | # Function to read PDF and return string
55 | def read_pdf(file):
56 | # Create a PyPDF2 reader object
57 | pdf_reader = PyPDF2.PdfFileReader(file)
58 |
59 | # Extract text from all pages of PDF
60 | text = ""
61 | for page in range(pdf_reader.getNumPages()):
62 | text += pdf_reader.getPage(page).extractText()
63 |
64 | # Return the text as a string
65 | return text
66 |
67 | # Upload PDF file
68 | file = st.file_uploader("Upload a PDF file", type="pdf", key='text2')
69 |
70 | # If file is uploaded
71 | if file is not None:
72 | # Call read_pdf function to convert PDF to string
73 | text2 = read_pdf(file)
74 |
75 |
76 | if st.button("Process"):
77 |
78 | sentence1 = text1
79 | output1 = stemmer.stem(sentence1)
80 |
81 |         hasil1 = re.sub(r"\d+", "", output1)
82 |         hasil1 = re.sub(r'[^a-zA-Z\s]', '', hasil1)
83 |
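    |         # Build a whole-word regex from the stopword list and strip those words from the stemmed text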
84 | pattern = re.compile(r'\b(' + r'|'.join(data) + r')\b\s*')
85 | hasil1 = pattern.sub('', hasil1)
86 |
87 |
88 | sentence2 = text2
89 | output2 = stemmer.stem(sentence2)
90 |
91 |         hasil2 = re.sub(r"\d+", "", output2)
92 |         hasil2 = re.sub(r'[^a-zA-Z\s]', '', hasil2)
93 |
94 | pattern = re.compile(r'\b(' + r'|'.join(data) + r')\b\s*')
95 | hasil2 = pattern.sub('', hasil2)
96 |
97 | documents = [hasil1, hasil2]
98 | from sklearn.feature_extraction.text import CountVectorizer
99 | import pandas as pd
100 |
101 | # Create the Document Term Matrix
102 |         # Stopwords were already removed above, so the default vectorizer settings are used
103 |         count_vectorizer = CountVectorizer()
104 | sparse_matrix = count_vectorizer.fit_transform(documents)
105 | from sklearn.metrics.pairwise import cosine_similarity
106 | cosine_sim = cosine_similarity(sparse_matrix, sparse_matrix)
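    |         # cosine_sim is a 2x2 matrix: the diagonal is 1.0 (each document against itself) and the off-diagonal value is the similarity between the two documents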
107 |
108 |
109 | plt.rcParams.update({'font.size': 26})
110 |
111 | heatmap = plt.figure(figsize =(5, 5))
112 | sns.heatmap(cosine_sim, fmt='.2g', annot=True)
113 |
114 |
115 | import matplotlib.pyplot as plt
116 | from wordcloud import WordCloud
117 |
118 | # Create a WordCloud object
119 | wordcloud = WordCloud(min_font_size=3,max_words=200,width=1600,height=720,
120 | colormap = 'Set2', background_color='white').generate(hasil1)
121 |
122 | # Display the WordCloud using Matplotlib and Streamlit
123 | fig, ax = plt.subplots()
124 | ax.imshow(wordcloud, interpolation='bilinear')
125 | ax.axis('off')
126 |
127 |
128 | # Create a WordCloud object
129 | wordcloud = WordCloud(min_font_size=3,max_words=200,width=1600,height=720,
130 | colormap = 'Set2', background_color='white').generate(hasil2)
131 |
132 | # Display the WordCloud using Matplotlib and Streamlit
133 | fig2, ax = plt.subplots()
134 | ax.imshow(wordcloud, interpolation='bilinear')
135 | ax.axis('off')
136 |
137 |
138 |         combined_text = hasil1 + hasil2
139 |         # Create a WordCloud object
140 |         wordcloud = WordCloud(min_font_size=3,max_words=200,width=1600,height=720,
141 |                        colormap = 'Set2', background_color='white').generate(combined_text)
142 |
143 | # Display the WordCloud using Matplotlib and Streamlit
144 | fig3, ax = plt.subplots()
145 | ax.imshow(wordcloud, interpolation='bilinear')
146 | ax.axis('off')
147 |
148 |
149 |
150 | #bigram visualization
151 | import collections
152 | # Get bigrams
153 | words1 = hasil1.split()
154 | bigrams = list(zip(words1, words1[1:]))
155 |
156 | # Count bigrams
157 | bigram_counts = collections.Counter(bigrams)
158 |
159 | # Get top 10 bigram counts
160 | top_bigrams = dict(bigram_counts.most_common(10))
161 |
162 | # Create bar chart
163 | plt.rcParams.update({'font.size': 12})
164 | fig4, ax = plt.subplots()
165 | ax.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center')
166 | ax.set_xticks(range(len(top_bigrams)))
167 | ax.set_xticklabels(list(top_bigrams.keys()))
168 | ax.set_xlabel('Bigram Words')
169 | ax.set_ylabel('Count')
170 | ax.set_title('Top 10 Bigram Word Counts')
171 | plt.xticks(rotation=90)
172 | plt.figure(figsize =(15, 15))
173 |
174 |
175 |
176 |
177 | #bigram visualization
178 | import collections
179 | # Get bigrams
180 | words2 = hasil2.split()
181 | bigrams = list(zip(words2, words2[1:]))
182 |
183 | # Count bigrams
184 | bigram_counts = collections.Counter(bigrams)
185 |
186 | # Get top 10 bigram counts
187 | top_bigrams = dict(bigram_counts.most_common(10))
188 |
189 | # Create bar chart
190 | plt.rcParams.update({'font.size': 12})
191 | fig5, ax = plt.subplots()
192 | ax.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center')
193 | ax.set_xticks(range(len(top_bigrams)))
194 | ax.set_xticklabels(list(top_bigrams.keys()))
195 | ax.set_xlabel('Bigram Words')
196 | ax.set_ylabel('Count')
197 | ax.set_title('Top 10 Bigram Word Counts')
198 | plt.xticks(rotation=90)
199 | plt.figure(figsize =(15, 15))
200 |
201 |         st.write("**Cosine Similarity Between the Two Documents**")
202 |         st.pyplot(heatmap)
203 |
204 | st.write("**WordCloud Document 1**")
205 | st.pyplot(fig)
206 |
207 | st.write("**WordCloud Document 2**")
208 | st.pyplot(fig2)
209 |
210 | st.write("**WordCloud From Both Documents**")
211 | st.pyplot(fig3)
212 |
213 | st.write("**Bi-Gram for Document 1**")
214 | st.pyplot(fig4)
215 |
216 | st.write("**Bi-Gram for Document 2**")
217 | st.pyplot(fig5)
218 |
219 |
220 | def to_markdown(text):
221 | text = text.replace('•', ' *')
222 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
223 |
224 | # Configure genai with API key
225 |         genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
226 |
227 | # Instantiate the model
228 | model = genai.GenerativeModel('gemini-1.0-pro-latest')
229 |
230 | # Generate content
231 |         response = model.generate_content(["Compare the similarities and give some conclusion between these 2 PDF documents : ", hasil1, "and", hasil2], stream=True)
232 | response.resolve()
233 | st.write("**Google Gemini Response About Data**")
234 | st.write(response.text)
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/pdf_document_analysis.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import pandas as pd
3 | from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
4 | from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
5 | from wordcloud import WordCloud
6 | import PyPDF2
7 | import re
8 | from io import StringIO
9 | import plotly.express as px
10 | import pandas as pd
11 | import collections
12 | import seaborn as sns
13 | sns.set_theme(color_codes=True)
14 | import os
15 | import pathlib
16 | import textwrap
17 | import google.generativeai as genai
18 | from IPython.display import display
19 | from IPython.display import Markdown
20 | import PIL.Image
21 | import matplotlib.pyplot as plt
22 |
23 | st.title("NLP : PDF Document Analysis")
24 | st.set_option('deprecation.showPyplotGlobalUse', False)
25 |
26 | # Function to convert text to Markdown format
27 | def to_markdown(text):
28 | text = text.replace('•', ' *')
29 | return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
30 |
31 | # Create stemmer
32 | factory = StemmerFactory()
33 | stemmer = factory.create_stemmer()
34 |
35 | # Create stopword remover
36 | stop_factory = StopWordRemoverFactory()
37 | more_stopword = ['dengan', 'ia', 'bahwa', 'oleh', 'rp', 'undang', 'pasal', 'ayat', 'bab']
38 | data = stop_factory.get_stop_words() + more_stopword
39 |
40 | # User input for custom stopwords
41 | custom_stopwords = st.text_input("Enter custom stopwords (comma-separated):")
42 | if custom_stopwords:
43 | custom_stopword_list = [word.strip() for word in custom_stopwords.split(",")]
44 | data.extend(custom_stopword_list)
45 |
46 | # Function to read PDF and return string
47 | def read_pdf(file):
48 | pdf_reader = PyPDF2.PdfFileReader(file)
49 | text = ""
50 | for page in range(pdf_reader.getNumPages()):
51 | text += pdf_reader.getPage(page).extractText()
52 | return text
53 |
54 | # Upload PDF file
55 | file = st.file_uploader("Upload a PDF file", type="pdf", key='text1')
56 |
57 | # If file is uploaded
58 | if file is not None:
59 | # Call read_pdf function to convert PDF to string
60 | text1 = read_pdf(file)
61 |
62 | # Stem and preprocess the text
63 | sentence1 = text1
64 | output1 = stemmer.stem(sentence1)
65 | hasil1 = re.sub(r"\d+", "", output1)
66 | hasil1 = re.sub(r'[^a-zA-Z\s]', '', hasil1)
67 | pattern = re.compile(r'\b(' + r'|'.join(data) + r')\b\s*')
68 | hasil1 = pattern.sub('', hasil1)
69 |
70 | # Create WordCloud
71 | wordcloud = WordCloud(
72 | min_font_size=3, max_words=200, width=800, height=400,
73 | colormap='Set2', background_color='white'
74 | ).generate(hasil1)
75 |
76 | # Save the WordCloud image
77 | wordcloud_file = "wordcloud.png"
78 | wordcloud.to_file(wordcloud_file)
79 |
80 | # Display the WordCloud using Streamlit
81 | st.subheader(f"Wordcloud Visualization")
82 | st.image(wordcloud_file)
83 |
84 | # Use Google Gemini API to generate content based on the uploaded image
85 | st.subheader("Google Gemini Response")
86 |
87 | # Load the image
88 | img = PIL.Image.open(wordcloud_file)
89 |
90 | # Configure and use the GenerativeAI model
91 |     genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
92 | model = genai.GenerativeModel('gemini-pro-vision')
93 | response = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image", img], stream=True)
94 | response.resolve()
95 |
96 | # Display Gemini API response in Markdown format
97 | st.write(response.text)
98 |
99 | # Use Google Gemini API to generate content based on the WordCloud image
100 |     genai.configure(api_key="YOUR_GOOGLE_GEMINI_API")
101 | model = genai.GenerativeModel('gemini-pro-vision')
102 | response_gemini = model.generate_content(["You are a professional Data Analyst, write the complete conclusion and actionable insight based on the image", img], stream=True)
103 | response_gemini.resolve()
104 |
105 | # Bigram visualization
106 | # Get bigrams
107 | words1 = hasil1.split()
108 | # Get bigrams
109 | bigrams = list(zip(words1, words1[1:]))
110 |
111 | # Count bigrams
112 | bigram_counts = collections.Counter(bigrams)
113 |
114 | # Get top 10 bigram counts
115 | top_bigrams = dict(bigram_counts.most_common(10))
116 |
117 | # Create bar chart
118 | plt.figure(figsize=(10, 7))
119 | plt.bar(range(len(top_bigrams)), list(top_bigrams.values()), align='center')
120 | plt.xticks(range(len(top_bigrams)), list(top_bigrams.keys()), rotation=90)
121 | plt.xlabel('Bigram Words')
122 | plt.ylabel('Count')
123 | plt.title(f"Top 10 Bigram from PDF Document")
124 |
125 |     # Keep the Gemini response text so it can be displayed below the bigram chart
126 | gemini_response_text = response_gemini.text
127 |
128 | # Save the entire plot as a PNG
129 | plt.tight_layout()
130 | plt.savefig("bigram_with_gemini_response.png")
131 |
132 | # Display the plot and Gemini response in Streamlit
133 | st.subheader("Bigram for PDF Document")
134 | st.image("bigram_with_gemini_response.png")
135 | st.subheader("Google Gemini Response")
136 | st.write(gemini_response_text)
137 |
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/table_scraper_analysis.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import requests
3 | from bs4 import BeautifulSoup
4 | import pandas as pd
5 | import pathlib
6 | import textwrap
7 | import google.generativeai as genai
8 | from IPython.display import display
9 | from IPython.display import Markdown
10 |
11 | def scrape_tables(url):
12 | """
13 | Scrapes all tables from a given URL and returns them as a list of DataFrames.
14 |
15 | Args:
16 | url: The URL of the webpage to scrape.
17 |
18 | Returns:
19 | A list of pandas DataFrames, each representing a scraped table.
20 | """
21 | # Fetch the HTML content
22 | response = requests.get(url)
23 | response.raise_for_status() # Raise an error if the request fails
24 |
25 | # Parse the HTML content
26 | soup = BeautifulSoup(response.content, "html.parser")
27 |
28 | # Find all tables
29 | tables = soup.find_all("table")
30 |
31 | # Extract data and convert to DataFrames
32 | all_dataframes = []
33 | for table in tables:
34 | # Extract rows from the table
35 | rows = table.find_all("tr")
36 | table_data = []
37 | for row in rows:
38 | # Extract cells from each row
39 | cells = row.find_all(["th", "td"]) # Consider both headers and data cells
40 | row_data = [cell.text.strip() for cell in cells] # Extract text and strip whitespace
41 | table_data.append(row_data)
42 |
43 | # Check if there's data before creating a DataFrame
44 | if table_data:
45 | df = pd.DataFrame(table_data)
46 | all_dataframes.append(df)
47 |
48 | return all_dataframes
49 |
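# Example usage (illustrative URL, not part of this app):
#   dfs = scrape_tables("https://en.wikipedia.org/wiki/List_of_sovereign_states")
#   print(len(dfs), "tables scraped"); print(dfs[0].head())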
50 | def display_and_modify_tables(dataframes):
51 | """
52 | Displays scraped DataFrames in Streamlit and allows user interaction for modifications.
53 |
54 | Args:
55 | dataframes: A list of pandas DataFrames containing scraped data.
56 | """
57 | # Display all scraped tables (head)
58 | if dataframes:
59 | st.subheader("Scraped Tables:")
60 | for i, df in enumerate(dataframes):
61 | st.write(f"Table {i+1}")
62 | st.dataframe(df.head()) # Show only the head (first few rows)
63 |
64 | # Table selection for modification
65 | selected_table_index = st.selectbox("Select a Table to Modify", range(len(dataframes)))
66 | selected_df = dataframes[selected_table_index]
67 |
68 | # Display the full selected table
69 | st.subheader(f"Selected Table {selected_table_index+1}")
70 | st.dataframe(selected_df)
71 |
72 | # Row selection for removal with multi-select
73 | rows_to_remove = st.multiselect("Select rows to remove (0-based):", selected_df.index.tolist(), key="rows_to_remove")
74 |
75 | # Combined button for row removal with confirmation
76 | if st.button("Remove Selected Rows"):
77 | if rows_to_remove: # Check if any rows were selected
78 | try:
79 | selected_df.drop(rows_to_remove, axis=0, inplace=True) # Remove rows
80 | st.success("Selected rows removed successfully!")
81 | # Display the modified DataFrame
82 | st.subheader(f"Modified Table {selected_table_index+1}")
83 | st.dataframe(selected_df)
84 | except Exception as e:
85 | st.error(f"Error removing rows: {e}")
86 |
87 | # --- Google Gemini Integration ---
88 | # Convert the DataFrame to a string variable
89 | df_string = selected_df.to_string()
90 |
91 | # Configure genai with API key (replace with your actual key)
92 | genai.configure(api_key="YOUR_GOOGLE_API_KEY")  # Replace with your Google GenerativeAI API key; do not commit real keys
93 |
94 | model = genai.GenerativeModel('gemini-1.0-pro-latest')
95 |
96 | try:
97 | # Generate content with Gemini
98 | response = model.generate_content(["You are a professional Data Analyst. Write a summary and actionable insights based on the scraped table data here:", df_string], stream=True)
99 | response.resolve()
100 | st.write("**Google Gemini Response About Data**")
101 | st.write(response.text)
102 | except Exception as e:
103 | st.error(f"Error generating content with Google Gemini: {e}")
104 |
105 |
106 | # Streamlit app
107 | st.title("Table Scraper and Modifier App")
108 | url = st.text_input("Enter the URL to scrape:")
109 | if url:
110 | try:
111 | scraped_dataframes = scrape_tables(url)
112 | display_and_modify_tables(scraped_dataframes)
113 | except requests.exceptions.RequestException as e:
114 | st.error(f"An error occurred scraping the URL: {e}")
115 |
116 |
117 |
118 |
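For pages whose tables parse cleanly, pandas can perform the same extraction that scrape_tables() does by hand with BeautifulSoup; a minimal sketch, assuming lxml or html5lib is installed and using a purely illustrative URL:

    import pandas as pd

    # read_html returns one DataFrame per <table> element found on the page
    tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)")
    print(len(tables), "tables found")
    print(tables[0].head())

The hand-rolled BeautifulSoup version remains useful when individual cells need custom cleaning before they become a DataFrame.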
--------------------------------------------------------------------------------
/Streamlit-Web-Application-main/web_scrape.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import requests
3 | from bs4 import BeautifulSoup
4 | import spacy
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | import seaborn as sns
8 | sns.set_theme(color_codes=True)
9 |
10 | st.title("Web Article Scraper and NLP Analyzer")
11 |
12 | target_url = st.text_input("Enter the target URL:")
13 | process_button = st.button("Scrape Text")
14 |
15 | def scrape_text(url):
16 | """Scrapes text from a website and returns the extracted text.
17 |
18 | Args:
19 | url: The URL of the website to scrape.
20 |
21 | Returns:
22 | The scraped text content as a string, or None if there's an error.
23 | """
24 |
25 | if not url: # Check if URL is empty
26 | return None
27 |
28 | try:
29 | # Send HTTP request and parse HTML content
30 | response = requests.get(url)
31 | soup = BeautifulSoup(response.content, "html.parser")
32 |
33 | # Extract text from the page's paragraphs
34 | # (only the first two are kept to keep the preview short)
35 | paragraphs = soup.find_all("p")
36 | paragraph_text = []
37 | for paragraph in paragraphs[:2]: # Limit to first 2 paragraphs
38 | paragraph_text.append(paragraph.text.strip())
39 |
40 | # Combine text from all paragraphs (limited to first 2)
41 | all_paragraph_text = "\n".join(paragraph_text)
42 |
43 | return all_paragraph_text
44 | except Exception as e:
45 | st.error(f"Error scraping text: {e}")
46 | return None
47 |
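# Example (illustrative URL): scrape_text("https://example.com/article") returns the
# first two <p> blocks joined by a newline, or None when the URL is empty or the request fails.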
48 | if process_button: # Only execute if button is clicked
49 | scraped_text = scrape_text(target_url)
50 |
51 | if scraped_text:
52 | st.success("Text scraped successfully!")
53 | st.subheader("Showing the First Two Paragraphs of the Article:")
54 | st.write(scraped_text) # Show only the first 2 paragraphs
55 |
56 | # Load English tokenizer, tagger, parser and NER
57 | nlp = spacy.load("en_core_web_sm")
58 |
59 | # Process the scraped text
60 | doc = nlp(scraped_text)
61 |
62 | # Analyze syntax - Extract Noun Phrases
63 | noun_phrases = [chunk.text for chunk in doc.noun_chunks]
64 |
65 | # Create DataFrame using Pandas (alternative to columns argument)
66 | noun_phrases_df = pd.DataFrame(noun_phrases, columns=["Noun Phrase"]) # Create DataFrame with Pandas
67 |
68 | # Display Noun Phrases in Streamlit table
69 | st.subheader("Noun Phrases:")
70 | st.dataframe(noun_phrases_df)
71 |
72 | # Analyze syntax - Extract Verbs
73 | verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]
74 |
75 | # Create DataFrame for Verbs
76 | verbs_df = pd.DataFrame(verbs, columns=["Verb"])
77 |
78 | # Display Verbs in Streamlit table
79 | st.subheader("Verbs:")
80 | st.dataframe(verbs_df)
81 |
82 |
83 | # Analyze Part-of-Speech Distribution
84 | pos_counts = {token.pos_: 0 for token in doc}
85 | for token in doc:
86 | pos_counts[token.pos_] += 1
87 |
88 | # Create Part-of-Speech Distribution Plot (using matplotlib)
89 | plt.figure(figsize=(8, 6))
90 | plt.bar(pos_counts.keys(), pos_counts.values())
91 | plt.xlabel("Part of Speech")
92 | plt.ylabel("Count")
93 | plt.xticks(rotation=45)
94 | plt.tight_layout()
95 |
96 | # Display Part-of-Speech Distribution Plot in Streamlit
97 | st.subheader("Part-of-Speech Distribution:")
98 | st.pyplot(plt)
99 |
100 | else:
101 | st.warning("No text found on the provided URL or an error occurred.")
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
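The spaCy analysis above assumes the small English model is already installed (python -m spacy download en_core_web_sm). A minimal sketch of the same noun-phrase and part-of-speech analysis outside Streamlit, using collections.Counter for the counting step:

    import spacy
    from collections import Counter

    nlp = spacy.load("en_core_web_sm")          # requires the downloaded model
    doc = nlp("The quick brown fox jumps over the lazy dog.")

    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    pos_counts = Counter(token.pos_ for token in doc)

    print(noun_phrases)              # e.g. ['The quick brown fox', 'the lazy dog']
    print(pos_counts.most_common(3))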
--------------------------------------------------------------------------------
/Tableau/Dashboard 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MagicDash91/All-of-Data-Science-Project/734e54ff951d39fb8d7ba007dcc9c82859ac7ec6/Tableau/Dashboard 1.png
--------------------------------------------------------------------------------