├── requirements.txt
├── README.md
├── LICENSE
└── app.py
/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.25.0
2 | numpy==1.23.5
3 | pandas==2.0.3
4 | wordcloud==1.9.2
5 | matplotlib==3.7.1
6 | PyPDF2==3.0.1
7 | python-docx==0.8.11
8 | plotly==5.15.0
9 |
10 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # word_cloud_app_by_aammar
2 | This repository contains a Streamlit application that generates a word cloud from an uploaded document (TXT, PDF, or DOCX).
3 |
4 |
5 | # Kaisa phir maza aya github ka k nahi? Coding ki zaroorat hi nahi hy
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Muhammad Aammar Tufail
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import pandas as pd
3 | import numpy as np
4 | from wordcloud import WordCloud, STOPWORDS
5 | import matplotlib.pyplot as plt
6 | import PyPDF2
7 | from docx import Document
8 | import plotly.express as px
9 | import base64
10 | from io import BytesIO
11 |
12 | # Functions for file reading
def read_txt(file):
    """Return the text content of an uploaded plain-text file.

    Args:
        file: Object exposing ``getvalue()`` that returns the raw bytes of
            the file (``io.BytesIO`` satisfies this).

    Returns:
        The decoded text as ``str``.
    """
    # "utf-8-sig" drops a leading byte-order mark so it does not end up glued
    # to the first word; errors="replace" keeps an upload that is not valid
    # UTF-8 from raising UnicodeDecodeError.
    return file.getvalue().decode("utf-8-sig", errors="replace")
15 |
def read_docx(file):
    """Return the text of every paragraph in a .docx file, joined by spaces.

    Args:
        file: Value passed straight to ``Document(...)``.

    Returns:
        One string containing each paragraph's ``text`` separated by a
        single space.
    """
    paragraphs = Document(file).paragraphs
    return " ".join(paragraph.text for paragraph in paragraphs)
19 |
def read_pdf(file):
    """Return the extracted text of every page in a PDF, joined by spaces.

    Args:
        file: Value passed straight to ``PyPDF2.PdfReader(...)``.

    Returns:
        One string containing each page's ``extract_text()`` result
        separated by a single space.
    """
    reader = PyPDF2.PdfReader(file)
    page_texts = []
    for page in reader.pages:
        page_texts.append(page.extract_text())
    return " ".join(page_texts)
23 |
24 | # Function to filter out stopwords
def filter_stopwords(text, additional_stopwords=None, *, use_standard=True):
    """Remove stopwords from whitespace-separated text.

    Matching is case-insensitive: both the tokens of ``text`` and the entries
    of ``additional_stopwords`` are lowercased before comparison, so an extra
    stopword such as ``"The"`` removes ``"the"`` and ``"The"`` alike.

    Args:
        text: Input string; it is split on whitespace.
        additional_stopwords: Optional iterable of extra words to drop.
            ``None`` (the default) means no extra words.
        use_standard: When true (the default), the module-level ``STOPWORDS``
            set is applied in addition to ``additional_stopwords``. Pass
            ``False`` to filter with the additional words only.

    Returns:
        The remaining tokens, in their original case and order, joined by
        single spaces.
    """
    blocked = {word.lower() for word in (additional_stopwords or ())}
    if use_standard:
        # STOPWORDS is only looked up when requested, so callers can fully
        # opt out of the standard list.
        blocked = blocked.union(STOPWORDS)

    kept = [word for word in text.split() if word.lower() not in blocked]
    return " ".join(kept)
30 |
31 | # Function to create download link for plot
def get_image_download_link(buffered, format_):
    """Build an HTML download link that embeds a saved plot as a data URI.

    Args:
        buffered: Object exposing ``getvalue()`` that returns the image bytes
            (for example the ``io.BytesIO`` a figure was saved into).
        format_: File format name such as ``"png"``, ``"jpeg"``, ``"svg"`` or
            ``"pdf"``; used for the MIME type, the suggested file name and
            the link text.

    Returns:
        An ``<a>`` element string whose ``href`` is a base64 data URI of the
        image and whose text is ``Download Plot as <format_>``.
    """
    mime_types = {
        "png": "image/png",
        "jpeg": "image/jpeg",
        "jpg": "image/jpeg",
        "svg": "image/svg+xml",
        "pdf": "application/pdf",
    }
    # Unknown formats fall back to a generic binary type so the browser still
    # offers the file for download.
    mime = mime_types.get(str(format_).lower(), "application/octet-stream")
    image_base64 = base64.b64encode(buffered.getvalue()).decode()
    return (
        f'<a href="data:{mime};base64,{image_base64}" '
        f'download="wordcloud.{format_}">Download Plot as {format_}</a>'
    )
35 |
36 | # Function to generate a download link for a DataFrame
def get_table_download_link(df, filename, file_label):
    """Build an HTML download link that embeds a DataFrame as a CSV data URI.

    Args:
        df: DataFrame to export; written with ``to_csv(index=False)``.
        filename: Suggested file name for the download.
        file_label: Text shown for the link.

    Returns:
        An ``<a>`` element string whose ``href`` is a base64 data URI of the
        CSV content.
    """
    from html import escape

    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    # The file name sits inside a quoted attribute, so escape quotes in it.
    safe_name = escape(str(filename), quote=True)
    return f'<a href="data:text/csv;base64,{b64}" download="{safe_name}">{file_label}</a>'
41 |
42 | # Streamlit code
def _count_words(text):
    """Return a DataFrame of whitespace-separated tokens and their counts, most frequent first."""
    return (
        pd.DataFrame({'Word': text.split()})
        .groupby('Word')
        .size()
        .reset_index(name='Count')
        .sort_values('Count', ascending=False)
    )


st.title("Word Cloud Generator")
st.subheader("📁 Upload a pdf, docx or text file to generate a word cloud")

uploaded_file = st.file_uploader("Choose a file", type=["txt", "pdf", "docx"])
st.set_option('deprecation.showPyplotGlobalUse', False)

if uploaded_file:
    file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type, "FileSize": uploaded_file.size}
    st.write(file_details)

    # Check the file type and read the file
    if uploaded_file.type == "text/plain":
        text = read_txt(uploaded_file)
    elif uploaded_file.type == "application/pdf":
        text = read_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        text = read_docx(uploaded_file)
    else:
        st.write("This file type is not supported yet.")
        text = ""

    # Count the raw text first: its most frequent words are offered as
    # candidate additional stopwords in the sidebar.
    word_count = _count_words(text)

    # Sidebar: Checkbox and Multiselect box for stopwords
    use_standard_stopwords = st.sidebar.checkbox("Use standard stopwords?", True)
    top_words = word_count['Word'].head(50).tolist()
    additional_stopwords = st.sidebar.multiselect("Additional stopwords:", sorted(top_words))

    # Lowercase the selected words because tokens are compared lowercased;
    # otherwise a capitalised selection would never match.
    all_stopwords = {word.lower() for word in additional_stopwords}
    if use_standard_stopwords:
        all_stopwords |= set(STOPWORDS)

    # Filter here with exactly the chosen set so that unticking the checkbox
    # really disables the standard stopwords.
    text = " ".join(word for word in text.split() if word.lower() not in all_stopwords)

    # Recount so the table reflects the filtered text.
    word_count = _count_words(text)

    if text:
        # Word Cloud dimensions
        width = st.sidebar.slider("Select Word Cloud Width", 400, 2000, 1200, 50)
        height = st.sidebar.slider("Select Word Cloud Height", 200, 2000, 800, 50)

        # Generate wordcloud
        st.subheader("Generated Word Cloud")
        fig, ax = plt.subplots(figsize=(width/100, height/100))  # Convert pixels to inches for figsize
        # Pass stopwords explicitly: when omitted, WordCloud falls back to its
        # built-in STOPWORDS and would ignore the checkbox.
        wordcloud_img = WordCloud(
            width=width,
            height=height,
            background_color='white',
            max_words=200,
            contour_width=3,
            contour_color='steelblue',
            stopwords=all_stopwords,
        ).generate(text)
        ax.imshow(wordcloud_img, interpolation='bilinear')
        ax.axis('off')

        # Save plot functionality
        format_ = st.selectbox("Select file format to save the plot", ["png", "jpeg", "svg", "pdf"])
        resolution = st.slider("Select Resolution", 100, 500, 300, 50)

        st.pyplot(fig)
        if st.button(f"Save as {format_}"):
            buffered = BytesIO()
            # Save this figure explicitly rather than whatever pyplot
            # considers the current figure.
            fig.savefig(buffered, format=format_, dpi=resolution)
            st.markdown(get_image_download_link(buffered, format_), unsafe_allow_html=True)

        # The script reruns on every interaction; close the figure so
        # matplotlib does not accumulate open figures.
        plt.close(fig)

# Sidebar footer
st.sidebar.markdown("---")
st.sidebar.subheader("Subscribe to our Youtube Channel to learn Data Science in Urdu/Hindi")
# add a youtube video
st.sidebar.video("https://youtu.be/omk5b1m2h38")
st.sidebar.markdown("---")
# add author name and info
st.sidebar.markdown("Created by: [Dr. Muhammad Aammar Tufail](https://github.com/AammarTufail)")
st.sidebar.markdown("Contact: [Email](mailto:aammar@codanics.com)")

# Word count table at the end; only shown once a file has been uploaded,
# because word_count does not exist before that.
if uploaded_file:
    st.subheader("Word Count Table")
    st.write(word_count)
    # Provide download link for table
    if st.button('Download Word Count Table as CSV'):
        st.markdown(get_table_download_link(word_count, "word_count.csv", "Click Here to Download"), unsafe_allow_html=True)
123 |
--------------------------------------------------------------------------------