├── .gitignore
├── README.md
├── text_clustering.ipynb
└── text_clustering.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Text Clustering Implementation
  2 | Implementation of text clustering using fastText word embedding and K-means algorithm. The dataset can be accessed via **[Kaggle](https://www.kaggle.com/dodyagung/accident).**
  3 | 
  4 | Texts are everywhere, with social media as one of its biggest generators. People are constantly sharing them on many platforms. Rather than letting it be as it is, we can process them into something useful using text mining methods. One famous application of text mining is sentiment analysis where we can identify whether a text's opinion is positive, negative, or neutral. But here, we'll talk about another way of doing it: text clustering. For more detailed and comprehensive explanation, you can read it **[here](https://towardsdatascience.com/making-sense-of-text-clustering-ca649c190b20).**
  5 | 
  6 | ## Steps to implement text clustering
  7 | 
  8 | **1. Load dependencies**
  9 | ```
 10 | import pandas as pd
 11 | import numpy as np
 12 | 
 13 | from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
 14 | from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
 15 | 
 16 | import fasttext
 17 | 
 18 | from sklearn.cluster import KMeans 
 19 | from sklearn.decomposition import PCA
 20 | 
 21 | import matplotlib.pyplot as plt 
 22 | from mpl_toolkits.mplot3d import Axes3D
 23 | ```
 24 | 
 25 | **2. Load and preprocess dataset**
 26 | ```
 27 | def text_preprocess(series, stemmer, stopwords):
 28 |     df = series.str.replace("\n\t",  " ")
 29 |     df = df.str.replace(r"[^a-zA-Z ]+", "")
 30 |     df = df.str.lower()
 31 |     df = df.apply(lambda x: ' '.join([stemmer.stem(item) for item in x.split() if item not in stopwords])) 
 32 |     return df
 33 |     
 34 | # Download first from Kaggle    
 35 | data = pd.read_csv('twitter_label_manual.csv')
 36 | 
 37 | # Get stopwords and create stemmer using Sastrawi
 38 | stopwords = StopWordRemoverFactory().get_stop_words()
 39 | stemmer = StemmerFactory().create_stemmer()
 40 | 
 41 | # Preprocess the sentences
 42 | data['processed_text'] = text_preprocess(data['full_text'], stemmer, stopwords)
 43 | ```
 44 | 
 45 | **3. Train word embedding model with fastText**
 46 | ```
 47 | # Build word embedding model and create one more with dim=3 for experimentation
 48 | model = fasttext.train_unsupervised('twitter.txt', model='skipgram', dim=100)
 49 | 
 50 | # Apply the word embedding model to the sentences
 51 | data['vec'] = data['processed_text'].apply(lambda x: model.get_sentence_vector(x))
 52 | ```
 53 | 
 54 | **4. Apply KMeans clustering algorithm**
 55 | ```
 56 | # Elbow Method to define number of k for the clustering
 57 | sum_of_squared_distances = []
 58 | K = range(1,10)
 59 | for k in K:
 60 |     km = KMeans(n_clusters=k)
 61 |     km = km.fit(pd.DataFrame(data['vec'].values.tolist()))
 62 |     sum_of_squared_distances.append(km.inertia_)
 63 | 
 64 | # Plot it
 65 | plt.plot(K, Sum_of_squared_distances, 'bx-')
 66 | plt.xlabel('k')
 67 | plt.ylabel('Sum_of_squared_distances')
 68 | plt.title('Elbow Method For Optimal k')
 69 | plt.show()
 70 | 
 71 | 
 72 | # K Means clustering
 73 | kmeans = KMeans(n_clusters=3)  
 74 | kmeans.fit(data['vec'].values.tolist())
 75 | 
 76 | # Fit and predict the cluster
 77 | data['cluster'] = kmeans.fit_predict(data['vec'].values.tolist())
 78 | ```
 79 | 
 80 | **5. Plot and see results**
 81 | ```
 82 | # Use PCA to reduce the dimensions
 83 | pca = PCA(n_components=3)
 84 | data['x'] = pca.fit_transform(data['vec'].values.tolist())[:,0]
 85 | data['y'] = pca.fit_transform(data['vec'].values.tolist())[:,1]
 86 | data['z'] = pca.fit_transform(data['vec'].values.tolist())[:,2]
 87 | 
 88 | # Plot in 2D
 89 | plt.scatter(data['y'],data['x'], c=data['cluster'], cmap='rainbow')
 90 | 
 91 | # Plot in 3D
 92 | fig = plt.figure(1, figsize=(10,10))
 93 | ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
 94 | ax.scatter(data['x'],data['y'],data['z'], c=data['cluster'], cmap='rainbow')
 95 | ax.set_xlabel("x")
 96 | ax.set_ylabel("y")
 97 | ax.set_zlabel("z")
 98 | ax.set_facecolor('white')
 99 | plt.title("Tweet Clustering using K Means", fontsize=14)
100 | 
101 | # Count flag on each cluster
102 | data.groupby(['cluster'])['is_accident'].value_counts()
103 | ```
104 | 
105 | Also, you can check the output of each step in the Jupyter Notebook file.
106 | 


--------------------------------------------------------------------------------
/text_clustering.py:
--------------------------------------------------------------------------------
 1 | # Author: Ignasius Harvey
 2 | # Date: 30 June, 2020
 3 | # Description: Implementation of text clustering using fastText word embedding and k-means algorithm.
 4 | # Reference: Saputro, D. A., & Girsang, A. S. (2020). Classification of Traffic Accident Information Using Machine Learning from Social Media. International Journal of Emerging Trends in Engineering Research, 8(3), 630–637. https://doi.org/10.30534/ijeter/2020/04832020
 5 | # Dataset: https://www.kaggle.com/dodyagung/accident
 6 | 
 7 | import pandas as pd
 8 | import numpy as np
 9 | 
10 | from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
11 | from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
12 | 
13 | import fasttext
14 | 
15 | from sklearn.cluster import KMeans 
16 | from sklearn.decomposition import PCA
17 | 
18 | import matplotlib.pyplot as plt 
19 | from mpl_toolkits.mplot3d import Axes3D
20 | 
21 | def text_preprocess(series, stemmer, stopwords):
22 |     df = series.str.replace("\n\t",  " ")
23 |     df = df.str.replace(r"[^a-zA-Z ]+", "")
24 |     df = df.str.lower()
25 |     df = df.apply(lambda x: ' '.join([stemmer.stem(item) for item in x.split() if item not in stopwords])) 
26 |     return df
27 | 
28 | ###### Load dataset and preprocess ######
29 | 
30 | # Download first from Kaggle
31 | data = pd.read_csv('twitter_label_manual.csv')
32 | 
33 | # Get stopwords and create stemmer using Sastrawi
34 | stopwords = StopWordRemoverFactory().get_stop_words()
35 | stemmer = StemmerFactory().create_stemmer()
36 | 
37 | # Preprocess the sentences
38 | data['processed_text'] = text_preprocess(data['full_text'], stemmer, stopwords)
39 | 
40 | 
41 | ###### Train word embedding model ######
42 | 
43 | # Build word embedding model and create one more with dim=3 for experimentation
44 | model = fasttext.train_unsupervised('twitter.txt', model='skipgram', dim=100)
45 | # model_3 = fasttext.train_unsupervised('twitter.txt', model='skipgram', dim=3)
46 | 
47 | # Apply the word embedding model to the sentences
48 | data['vec'] = data['processed_text'].apply(lambda x: model.get_sentence_vector(x))
49 | # data['vec'] = data['processed_text'].apply(lambda x: model_3.get_sentence_vector(x))
50 | 
51 | 
52 | ###### CLustering Attempt ######
53 | 
54 | # Elbow Method to define number of k for the clustering
55 | sum_of_squared_distances = []
56 | K = range(1,10)
57 | for k in K:
58 |     km = KMeans(n_clusters=k)
59 |     km = km.fit(pd.DataFrame(data['vec'].values.tolist()))
60 |     sum_of_squared_distances.append(km.inertia_)
61 | 
62 | # Plot it
63 | plt.plot(K, Sum_of_squared_distances, 'bx-')
64 | plt.xlabel('k')
65 | plt.ylabel('Sum_of_squared_distances')
66 | plt.title('Elbow Method For Optimal k')
67 | plt.show()
68 | 
69 | 
70 | # K Means clustering
71 | kmeans = KMeans(n_clusters=3)  
72 | kmeans.fit(data['vec'].values.tolist())
73 | 
74 | # Fit and predict the cluster
75 | data['cluster'] = kmeans.fit_predict(data['vec'].values.tolist())
76 | 
77 | 
78 | # Use PCA to reduce the dimensions
79 | pca = PCA(n_components=3)
80 | data['x'] = pca.fit_transform(data['vec'].values.tolist())[:,0]
81 | data['y'] = pca.fit_transform(data['vec'].values.tolist())[:,1]
82 | data['z'] = pca.fit_transform(data['vec'].values.tolist())[:,2]
83 | 
84 | # Plot in 2D
85 | plt.scatter(data['y'],data['x'], c=data['cluster'], cmap='rainbow')
86 | 
87 | # Plot in 3D
88 | fig = plt.figure(1, figsize=(10,10))
89 | ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
90 | ax.scatter(data['x'],data['y'],data['z'], c=data['cluster'], cmap='rainbow')
91 | ax.set_xlabel("x")
92 | ax.set_ylabel("y")
93 | ax.set_zlabel("z")
94 | ax.set_facecolor('white')
95 | plt.title("Tweet Clustering using K Means", fontsize=14)
96 | 
97 | # Count flag on each cluster
98 | data.groupby(['cluster'])['is_accident'].value_counts()


--------------------------------------------------------------------------------