├── requirements.txt ├── README.md ├── LICENSE └── app.py /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.25.0 2 | numpy==1.23.5 3 | pandas==2.0.3 4 | wordcloud==1.9.2 5 | matplotlib==3.7.1 6 | PyPDF2==3.0.1 7 | python-docx==0.8.11 8 | plotly==5.15.0 9 | 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # word_cloud_app_by_aammar 2 | This repository contains an application that makes a word cloud from an uploaded document (PDF, DOCX, or plain text). 3 | 4 | 5 | # Kaisa phir maza aya github ka k nahi? Coding ki zaroorat hi nahi hy 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Muhammad Aammar Tufail 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# NOTE: this span of the dump begins in the middle of the LICENSE file. The
# remainder of the MIT license text is preserved here so no content is lost:
#
#   IN NO EVENT SHALL THE
#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#   SOFTWARE.
#
# /app.py
"""Streamlit app that builds a word cloud and a word-count table from a file.

The user uploads a txt, pdf or docx file; the text is extracted, stopwords are
removed according to the sidebar settings, and the result is rendered as a
word cloud plus a table of word frequencies. Both can be downloaded through
``data:`` links embedded in the page.
"""

import base64
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import PyPDF2
import streamlit as st
from docx import Document
from wordcloud import STOPWORDS, WordCloud

# MIME types for the formats offered in the "save plot" select box.
_IMAGE_MIME_TYPES = {
    "png": "image/png",
    "jpeg": "image/jpeg",
    "svg": "image/svg+xml",
    "pdf": "application/pdf",
}


# Functions for file reading
def read_txt(file):
    """Return the contents of an uploaded plain-text file as a string.

    The bytes are decoded as UTF-8; undecodable bytes are replaced instead of
    raising, so a file in another encoding does not crash the app.
    """
    return file.getvalue().decode("utf-8", errors="replace")


def read_docx(file):
    """Return the text of all paragraphs of a .docx file, joined by spaces."""
    doc = Document(file)
    return " ".join(para.text for para in doc.paragraphs)


def read_pdf(file):
    """Return the extracted text of all pages of a PDF, joined by spaces."""
    pdf = PyPDF2.PdfReader(file)
    # `or ""` guards against a page for which no text could be extracted.
    return " ".join(page.extract_text() or "" for page in pdf.pages)


# Maps the MIME type reported for the upload to the matching reader.
_READERS = {
    "text/plain": read_txt,
    "application/pdf": read_pdf,
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": read_docx,
}


def _build_stopword_set(additional_stopwords=None, use_standard=True):
    """Return the lower-cased set of stopwords that should be removed."""
    # Words are compared in lower case, so the extra stopwords must be
    # lower-cased too; otherwise a choice such as "The" would never match.
    extra = {word.lower() for word in (additional_stopwords or ())}
    if use_standard:
        return STOPWORDS | extra
    return extra


# Function to filter out stopwords
def filter_stopwords(text, additional_stopwords=None, *, use_standard=True):
    """Remove stopwords from ``text`` and return the remaining words.

    Args:
        text: Whitespace-separated text to filter.
        additional_stopwords: Optional iterable of extra words to remove
            (matched case-insensitively).
        use_standard: When true (the default), the ``wordcloud.STOPWORDS``
            set is removed as well; when false only ``additional_stopwords``
            are removed.

    Returns:
        The kept words joined by single spaces, in their original order and
        original casing.
    """
    stopwords = _build_stopword_set(additional_stopwords, use_standard)
    kept = [word for word in text.split() if word.lower() not in stopwords]
    return " ".join(kept)


# Function to create download link for plot
def get_image_download_link(buffered, format_):
    """Return an HTML anchor that downloads the image held in ``buffered``.

    Args:
        buffered: ``BytesIO`` containing the already-saved figure.
        format_: File format/extension of the saved figure (e.g. ``"png"``).
    """
    image_base64 = base64.b64encode(buffered.getvalue()).decode()
    mime = _IMAGE_MIME_TYPES.get(format_, f"image/{format_}")
    # The payload is embedded as a data URI so no server-side file is needed.
    return (
        f'<a href="data:{mime};base64,{image_base64}" '
        f'download="wordcloud.{format_}">Download Plot as {format_}</a>'
    )


# Function to generate a download link for a DataFrame
def get_table_download_link(df, filename, file_label):
    """Return an HTML anchor that downloads ``df`` as a CSV file.

    Args:
        df: DataFrame to export (written without the index).
        filename: File name suggested to the browser for the download.
        file_label: Visible text of the link.
    """
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    return f'<a href="data:text/csv;base64,{b64}" download="{filename}">{file_label}</a>'


def build_word_count(text):
    """Return a DataFrame of words and their counts, most frequent first."""
    words = text.split()
    return (
        pd.DataFrame({'Word': words})
        .groupby('Word')
        .size()
        .reset_index(name='Count')
        .sort_values('Count', ascending=False)
    )


def _render_word_cloud(text, stopwords):
    """Draw the word cloud for ``text`` and offer it for download."""
    # Word Cloud dimensions
    width = st.sidebar.slider("Select Word Cloud Width", 400, 2000, 1200, 50)
    height = st.sidebar.slider("Select Word Cloud Height", 200, 2000, 800, 50)

    # Generate wordcloud
    st.subheader("Generated Word Cloud")
    try:
        # Passing the stopword set explicitly keeps the image consistent with
        # the sidebar checkbox: WordCloud falls back to its own default
        # STOPWORDS only when `stopwords` is None.
        wordcloud_img = WordCloud(
            width=width,
            height=height,
            background_color='white',
            max_words=200,
            contour_width=3,
            contour_color='steelblue',
            stopwords=stopwords,
        ).generate(text)
    except ValueError:
        # Raised by WordCloud when no plottable word is left in the text.
        st.warning("No words left to plot after filtering.")
        return

    # Convert pixels to inches for figsize
    fig, ax = plt.subplots(figsize=(width / 100, height / 100))
    try:
        ax.imshow(wordcloud_img, interpolation='bilinear')
        ax.axis('off')

        # Save plot functionality
        format_ = st.selectbox("Select file format to save the plot", ["png", "jpeg", "svg", "pdf"])
        resolution = st.slider("Select Resolution", 100, 500, 300, 50)
        st.pyplot(fig)
        if st.button(f"Save as {format_}"):
            buffered = BytesIO()
            # Save this figure explicitly rather than pyplot's "current" one.
            fig.savefig(buffered, format=format_, dpi=resolution)
            st.markdown(get_image_download_link(buffered, format_), unsafe_allow_html=True)
    finally:
        # The script reruns on every interaction; close the figure so open
        # figures do not accumulate in the pyplot registry.
        plt.close(fig)


def main():
    """Run the Streamlit page: upload, filter, word cloud and count table."""
    # Streamlit code
    st.title("Word Cloud Generator")
    st.subheader("📁 Upload a pdf, docx or text file to generate a word cloud")

    uploaded_file = st.file_uploader("Choose a file", type=["txt", "pdf", "docx"])
    st.set_option('deprecation.showPyplotGlobalUse', False)

    if not uploaded_file:
        return

    file_details = {
        "FileName": uploaded_file.name,
        "FileType": uploaded_file.type,
        "FileSize": uploaded_file.size,
    }
    st.write(file_details)

    # Check the file type and read the file
    reader = _READERS.get(uploaded_file.type)
    if reader is None:
        st.write("This file type is not supported yet.")
        text = ""
    else:
        text = reader(uploaded_file)

    # Unfiltered counts: only used to suggest candidate stopwords.
    raw_word_count = build_word_count(text)

    # Sidebar: Checkbox and Multiselect box for stopwords
    use_standard_stopwords = st.sidebar.checkbox("Use standard stopwords?", True)
    top_words = raw_word_count['Word'].head(50).tolist()
    additional_stopwords = st.sidebar.multiselect("Additional stopwords:", sorted(top_words))

    active_stopwords = _build_stopword_set(additional_stopwords, use_standard_stopwords)
    text = filter_stopwords(text, additional_stopwords, use_standard=use_standard_stopwords)

    if text:
        _render_word_cloud(text, active_stopwords)

    # Counts of the filtered text, so the table matches the word cloud.
    word_count = build_word_count(text)

    st.sidebar.markdown("---")
    st.sidebar.subheader("Subscribe to our Youtube Channel to learn Data Science in Urdu/Hindi")
    # add a youtube video
    st.sidebar.video("https://youtu.be/omk5b1m2h38")
    st.sidebar.markdown("---")
    # add author name and info
    st.sidebar.markdown("Created by: [Dr. Muhammad Aammar Tufail](https://github.com/AammarTufail)")
    st.sidebar.markdown("Contact: [Email](mailto:aammar@codanics.com)")

    # Word count table at the end (shown once, after the word cloud)
    st.subheader("Word Count Table")
    st.write(word_count)
    # Provide download link for table
    if st.button('Download Word Count Table as CSV'):
        st.markdown(
            get_table_download_link(word_count, "word_count.csv", "Click Here to Download"),
            unsafe_allow_html=True,
        )


# `streamlit run` executes the script with __name__ set to "__main__".
if __name__ == "__main__":
    main()