├── .gitignore
├── README.md
├── text_clustering.ipynb
└── text_clustering.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Text Clustering Implementation
Implementation of text clustering using fastText word embeddings and the k-means algorithm. The dataset can be accessed via **[Kaggle](https://www.kaggle.com/dodyagung/accident).**

Text is everywhere, and social media is one of its biggest generators. People constantly share it across many platforms. Rather than letting it sit unused, we can process it into something useful with text mining methods. One well-known application of text mining is sentiment analysis, where we identify whether a text's opinion is positive, negative, or neutral. Here, we'll look at another approach: text clustering.
For a more detailed and comprehensive explanation, you can read **[this article](https://towardsdatascience.com/making-sense-of-text-clustering-ca649c190b20).**

## Steps to implement text clustering

**1. Load dependencies**
```
import pandas as pd
import numpy as np

# Sastrawi provides Indonesian stopwords and stemming
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import fasttext

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection
```

**2. Load and preprocess the dataset**
```
def text_preprocess(series, stemmer, stopwords):
    # Replace newlines/tabs with spaces, keep letters only, lowercase,
    # then remove stopwords and stem each remaining word
    df = series.str.replace(r"[\n\t]+", " ", regex=True)
    df = df.str.replace(r"[^a-zA-Z ]+", "", regex=True)
    df = df.str.lower()
    df = df.apply(lambda x: ' '.join([stemmer.stem(item) for item in x.split() if item not in stopwords]))
    return df

# Download first from Kaggle
data = pd.read_csv('twitter_label_manual.csv')

# Get stopwords and create a stemmer using Sastrawi
stopwords = StopWordRemoverFactory().get_stop_words()
stemmer = StemmerFactory().create_stemmer()

# Preprocess the sentences
data['processed_text'] = text_preprocess(data['full_text'], stemmer, stopwords)
```

**3. Train a word embedding model with fastText**
```
# Train a skip-gram word embedding model on the tweet corpus.
# 'twitter.txt' is assumed to be a plain-text file with one tweet per line;
# if it is not part of the downloaded dataset, it can be written from the
# processed text, e.g.:
# data['processed_text'].to_csv('twitter.txt', index=False, header=False)
model = fasttext.train_unsupervised('twitter.txt', model='skipgram', dim=100)

# Apply the word embedding model to the sentences
data['vec'] = data['processed_text'].apply(lambda x: model.get_sentence_vector(x))
```

**4. Apply the k-means clustering algorithm**
```
# Elbow method to choose the number of clusters k
sum_of_squared_distances = []
K = range(1, 10)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(pd.DataFrame(data['vec'].values.tolist()))
    sum_of_squared_distances.append(km.inertia_)

# Plot it
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method For Optimal k')
plt.show()


# K-means clustering: fit the model and assign each tweet to a cluster
kmeans = KMeans(n_clusters=3)
data['cluster'] = kmeans.fit_predict(data['vec'].values.tolist())
```

**5. Plot and inspect the results**
```
# Use PCA to reduce the vectors to 3 dimensions (fit once, reuse the result)
pca = PCA(n_components=3)
components = pca.fit_transform(data['vec'].values.tolist())
data['x'] = components[:, 0]
data['y'] = components[:, 1]
data['z'] = components[:, 2]

# Plot in 2D
plt.scatter(data['y'], data['x'], c=data['cluster'], cmap='rainbow')

# Plot in 3D (Axes3D(fig, ...) no longer attaches the axes in recent Matplotlib,
# so create the axes through the figure instead)
fig = plt.figure(1, figsize=(10, 10))
ax = fig.add_subplot(projection='3d')
ax.view_init(elev=48, azim=134)
ax.scatter(data['x'], data['y'], data['z'], c=data['cluster'], cmap='rainbow')
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_zlabel("z")
ax.set_facecolor('white')
plt.title("Tweet Clustering using K Means", fontsize=14)

# Count the accident flag in each cluster
data.groupby(['cluster'])['is_accident'].value_counts()
```

Also, you can check the output of each step in the Jupyter Notebook file.
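To get a feel for what each cluster actually contains, you can also print a few raw tweets per cluster. This snippet is a minimal sketch that is not part of the original notebook; it assumes the `data` frame built in the steps above, with its `full_text` and `cluster` columns.
```
for c in sorted(data['cluster'].unique()):
    print(f"--- Cluster {c} ---")
    print(data.loc[data['cluster'] == c, 'full_text'].head(3).to_string(index=False))
```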
106 | -------------------------------------------------------------------------------- /text_clustering.py: -------------------------------------------------------------------------------- 1 | # Author: Ignasius Harvey 2 | # Date: 30 June, 2020 3 | # Description: Implementation of text clustering using fastText word embedding and k-means algorithm. 4 | # Reference: Saputro, D. A., & Girsang, A. S. (2020). Classification of Traffic Accident Information Using Machine Learning from Social Media. International Journal of Emerging Trends in Engineering Research, 8(3), 630–637. https://doi.org/10.30534/ijeter/2020/04832020 5 | # Dataset: https://www.kaggle.com/dodyagung/accident 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory 11 | from Sastrawi.Stemmer.StemmerFactory import StemmerFactory 12 | 13 | import fasttext 14 | 15 | from sklearn.cluster import KMeans 16 | from sklearn.decomposition import PCA 17 | 18 | import matplotlib.pyplot as plt 19 | from mpl_toolkits.mplot3d import Axes3D 20 | 21 | def text_preprocess(series, stemmer, stopwords): 22 | df = series.str.replace("\n\t", " ") 23 | df = df.str.replace(r"[^a-zA-Z ]+", "") 24 | df = df.str.lower() 25 | df = df.apply(lambda x: ' '.join([stemmer.stem(item) for item in x.split() if item not in stopwords])) 26 | return df 27 | 28 | ###### Load dataset and preprocess ###### 29 | 30 | # Download first from Kaggle 31 | data = pd.read_csv('twitter_label_manual.csv') 32 | 33 | # Get stopwords and create stemmer using Sastrawi 34 | stopwords = StopWordRemoverFactory().get_stop_words() 35 | stemmer = StemmerFactory().create_stemmer() 36 | 37 | # Preprocess the sentences 38 | data['processed_text'] = text_preprocess(data['full_text'], stemmer, stopwords) 39 | 40 | 41 | ###### Train word embedding model ###### 42 | 43 | # Build word embedding model and create one more with dim=3 for experimentation 44 | model = fasttext.train_unsupervised('twitter.txt', model='skipgram', dim=100) 45 | # model_3 = fasttext.train_unsupervised('twitter.txt', model='skipgram', dim=3) 46 | 47 | # Apply the word embedding model to the sentences 48 | data['vec'] = data['processed_text'].apply(lambda x: model.get_sentence_vector(x)) 49 | # data['vec'] = data['processed_text'].apply(lambda x: model_3.get_sentence_vector(x)) 50 | 51 | 52 | ###### CLustering Attempt ###### 53 | 54 | # Elbow Method to define number of k for the clustering 55 | sum_of_squared_distances = [] 56 | K = range(1,10) 57 | for k in K: 58 | km = KMeans(n_clusters=k) 59 | km = km.fit(pd.DataFrame(data['vec'].values.tolist())) 60 | sum_of_squared_distances.append(km.inertia_) 61 | 62 | # Plot it 63 | plt.plot(K, Sum_of_squared_distances, 'bx-') 64 | plt.xlabel('k') 65 | plt.ylabel('Sum_of_squared_distances') 66 | plt.title('Elbow Method For Optimal k') 67 | plt.show() 68 | 69 | 70 | # K Means clustering 71 | kmeans = KMeans(n_clusters=3) 72 | kmeans.fit(data['vec'].values.tolist()) 73 | 74 | # Fit and predict the cluster 75 | data['cluster'] = kmeans.fit_predict(data['vec'].values.tolist()) 76 | 77 | 78 | # Use PCA to reduce the dimensions 79 | pca = PCA(n_components=3) 80 | data['x'] = pca.fit_transform(data['vec'].values.tolist())[:,0] 81 | data['y'] = pca.fit_transform(data['vec'].values.tolist())[:,1] 82 | data['z'] = pca.fit_transform(data['vec'].values.tolist())[:,2] 83 | 84 | # Plot in 2D 85 | plt.scatter(data['y'],data['x'], c=data['cluster'], cmap='rainbow') 86 | 87 | # Plot in 3D 88 | fig = plt.figure(1, 


# Use PCA to reduce the vectors to 3 dimensions (fit once, reuse the result)
pca = PCA(n_components=3)
components = pca.fit_transform(data['vec'].values.tolist())
data['x'] = components[:, 0]
data['y'] = components[:, 1]
data['z'] = components[:, 2]

# Plot in 2D
plt.scatter(data['y'], data['x'], c=data['cluster'], cmap='rainbow')

# Plot in 3D (Axes3D(fig, ...) no longer attaches the axes in recent Matplotlib,
# so create the axes through the figure instead)
fig = plt.figure(1, figsize=(10, 10))
ax = fig.add_subplot(projection='3d')
ax.view_init(elev=48, azim=134)
ax.scatter(data['x'], data['y'], data['z'], c=data['cluster'], cmap='rainbow')
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_zlabel("z")
ax.set_facecolor('white')
plt.title("Tweet Clustering using K Means", fontsize=14)

# Count the accident flag in each cluster
data.groupby(['cluster'])['is_accident'].value_counts()

--------------------------------------------------------------------------------