├── requirements.txt
├── README.md
├── LICENSE
└── app.py


/requirements.txt:
--------------------------------------------------------------------------------
 1 | streamlit==1.25.0
 2 | numpy==1.23.5
 3 | pandas==2.0.3
 4 | wordcloud==1.9.2
 5 | matplotlib==3.7.1
 6 | PyPDF2==3.0.1
 7 | python-docx==0.8.11
 8 | plotly==5.15.0
 9 | 
10 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # word_cloud_app_by_aammar
2 | This repository contains a Streamlit application that generates a word cloud from an uploaded document (TXT, PDF, or DOCX).
3 | 
4 | 
5 | # Kaisa phir maza aya github ka k nahi? Coding ki zaroorat hi nahi hy
6 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Muhammad Aammar Tufail
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import numpy as np
  4 | from wordcloud import WordCloud, STOPWORDS
  5 | import matplotlib.pyplot as plt
  6 | import PyPDF2
  7 | from docx import Document  
  8 | import plotly.express as px
  9 | import base64
 10 | from io import BytesIO
 11 | 
 12 | # Functions for file reading
 13 | def read_txt(file):
 14 |     return file.getvalue().decode("utf-8")
 15 | 
 16 | def read_docx(file):
 17 |     doc = Document(file)
 18 |     return " ".join([para.text for para in doc.paragraphs])
 19 | 
 20 | def read_pdf(file):
 21 |     pdf = PyPDF2.PdfReader(file)
 22 |     return " ".join([page.extract_text() for page in pdf.pages])
 23 | 
 24 | # Function to filter out stopwords
 25 | def filter_stopwords(text, additional_stopwords=[]):
 26 |     words = text.split()
 27 |     all_stopwords = STOPWORDS.union(set(additional_stopwords))
 28 |     filtered_words = [word for word in words if word.lower() not in all_stopwords]
 29 |     return " ".join(filtered_words)
 30 | 
 31 | # Function to create download link for plot
 32 | def get_image_download_link(buffered, format_):
 33 |     image_base64 = base64.b64encode(buffered.getvalue()).decode()
 34 |     return f'<a href="data:image/{format_};base64,{image_base64}" download="wordcloud.{format_}">Download Plot as {format_}</a>'
 35 | 
 36 | # Function to generate a download link for a DataFrame
 37 | def get_table_download_link(df, filename, file_label):
 38 |     csv = df.to_csv(index=False)
 39 |     b64 = base64.b64encode(csv.encode()).decode()
 40 |     return f'<a href="data:file/csv;base64,{b64}" download="{filename}">{file_label}</a>'
 41 | 
 42 | # Streamlit code
 43 | st.title("Word Cloud Generator")
 44 | st.subheader("📁 Upload a pdf, docx or text file to generate a word cloud")
 45 | 
 46 | uploaded_file = st.file_uploader("Choose a file", type=["txt", "pdf", "docx"])
 47 | st.set_option('deprecation.showPyplotGlobalUse', False)
 48 | 
 49 | if uploaded_file:
 50 |     file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type, "FileSize": uploaded_file.size}
 51 |     st.write(file_details)
 52 | 
 53 |     # Check the file type and read the file
 54 |     if uploaded_file.type == "text/plain":
 55 |         text = read_txt(uploaded_file)
 56 |     elif uploaded_file.type == "application/pdf":
 57 |         text = read_pdf(uploaded_file)
 58 |     elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
 59 |         text = read_docx(uploaded_file)
 60 |     else:
 61 |         st.write("This file type is not supported yet.")
 62 |         text = ""
 63 | 
 64 |     # Generate word count table
 65 |     words = text.split()
 66 |     word_count = pd.DataFrame({'Word': words}).groupby('Word').size().reset_index(name='Count').sort_values('Count', ascending=False)
 67 | 
 68 |     # Sidebar: Checkbox and Multiselect box for stopwords
 69 |     use_standard_stopwords = st.sidebar.checkbox("Use standard stopwords?", True)
 70 |     top_words = word_count['Word'].head(50).tolist()
 71 |     additional_stopwords = st.sidebar.multiselect("Additional stopwords:", sorted(top_words))
 72 | 
 73 |     if use_standard_stopwords:
 74 |         all_stopwords = STOPWORDS.union(set(additional_stopwords))
 75 |     else:
 76 |         all_stopwords = set(additional_stopwords)
 77 | 
 78 |     text = filter_stopwords(text, all_stopwords)
 79 | 
 80 |     if text:
 81 |         # Word Cloud dimensions
 82 |         width = st.sidebar.slider("Select Word Cloud Width", 400, 2000, 1200, 50)
 83 |         height = st.sidebar.slider("Select Word Cloud Height", 200, 2000, 800, 50)
 84 | 
 85 |         # Generate wordcloud
 86 |         st.subheader("Generated Word Cloud")
 87 |         fig, ax = plt.subplots(figsize=(width/100, height/100))  # Convert pixels to inches for figsize
 88 |         wordcloud_img = WordCloud(width=width, height=height, background_color='white', max_words=200, contour_width=3, contour_color='steelblue').generate(text)
 89 |         ax.imshow(wordcloud_img, interpolation='bilinear')
 90 |         ax.axis('off')
 91 | 
 92 |         # Save plot functionality
 93 |         format_ = st.selectbox("Select file format to save the plot", ["png", "jpeg", "svg", "pdf"])
 94 |         resolution = st.slider("Select Resolution", 100, 500, 300, 50)
 95 |         # Generate word count table
 96 |         st.subheader("Word Count Table")
 97 |         words = text.split()
 98 |         word_count = pd.DataFrame({'Word': words}).groupby('Word').size().reset_index(name='Count').sort_values('Count', ascending=False)
 99 |         st.write(word_count)
100 |     st.pyplot(fig)
101 |     if st.button(f"Save as {format_}"):
102 |         buffered = BytesIO()
103 |         plt.savefig(buffered, format=format_, dpi=resolution)
104 |         st.markdown(get_image_download_link(buffered, format_), unsafe_allow_html=True)
105 |     
106 |     
107 |     # Word count table at the end
108 |     st.sidebar.markdown("---")
109 |     st.sidebar.subheader("Subscribe to our Youtube Channel to learn Data Science in Urdu/Hindi")
110 |     # add a youtube video
111 |     st.sidebar.video("https://youtu.be/omk5b1m2h38")
112 |     st.sidebar.markdown("---")
113 |     # add author name and info
114 |     st.sidebar.markdown("Created by: [Dr. Muhammad Aammar Tufail](https://github.com/AammarTufail)")
115 |     st.sidebar.markdown("Contact: [Email](mailto:aammar@codanics.com)")
116 |     
117 |     
118 |     st.subheader("Word Count Table")
119 |     st.write(word_count)
120 |     # Provide download link for table
121 |     if st.button('Download Word Count Table as CSV'):
122 |         st.markdown(get_table_download_link(word_count, "word_count.csv", "Click Here to Download"), unsafe_allow_html=True)
123 | 


--------------------------------------------------------------------------------