├── LICENSE ├── README.md └── Text.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Ken Flerlage 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text-Analysis 2 | Python script that will break text into words and n-grams. You can then leverage this data in Tableau to create a variety of charts. Some examples shown below. 
3 | 4 | Original Blog: https://www.flerlagetwins.com/2019/09/text-analysis.html 5 | 6 | ![Text Analysis](https://1.bp.blogspot.com/-YP_eFyw_dyM/XXQ7Q2WQWMI/AAAAAAAAOeo/m8-eZob5fdoC4RXIBM-J5aBui5hjeD3RACEwYBhgL/s1600/00%2BChart%2BMenu.png) 7 | 8 | -------------------------------------------------------------------------------- /Text.py: -------------------------------------------------------------------------------- 1 | # This code will read an input csv file and break specified text fields into words and n-grams. 2 | # Basic sentiment scores will also be assigned to each word and n-gram. 3 | # Stop words will also be identified in the word breakout. 4 | # Two csv files (one for words and one for n-grams) will be written. 5 | # 6 | # Written by Ken Flerlage, August, 2019 7 | # 8 | # Note: This code makes use of the Natural Language Toolkit (NLTK) library. 9 | # Be sure to download all the nltk data by using the following command: python -m nltk.downloader all 10 | # ...or you can run the following commands to save time and disk space: 11 | # python -m nltk.downloader vader_lexicon 12 | # python -m nltk.downloader stopwords 13 | # To add more stop words, edit the language file in nltk_data\corpora\stopwords 14 | # 15 | # This code uses a number of other libraries which may need to be installed before using. 
16 | # 17 | # -------------------------------------------------------------------------------------------------------- 18 | # Copyright (c) 2019 Kenneth Flerlage 19 | # 20 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 21 | # associated documentation files (the "Software"), to deal in the Software without restriction, including 22 | # without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 23 | # copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to 24 | # the following conditions: The above copyright notice and this permission notice shall be included in 25 | # all copies or substantial portions of the Software. 26 | # 27 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT 28 | # LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN 29 | # NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 30 | # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 31 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 32 | # -------------------------------------------------------------------------------------------------------- 33 | 34 | import csv 35 | import re 36 | import math 37 | import os 38 | import sys 39 | from nltk.util import ngrams 40 | from nltk.corpus import stopwords 41 | from nltk.sentiment.vader import SentimentIntensityAnalyzer 42 | from nltk.stem import SnowballStemmer 43 | import PySimpleGUI as sg 44 | 45 | # Get user input. 
# ---------------------------------------------------------------------------
# Script body: prompt for parameters, then break each requested text field of
# the input CSV into single words and n-grams, assign a VADER sentiment score
# to each, and write the results to Words.csv and NGrams.csv in the same
# folder as the input file.
# ---------------------------------------------------------------------------

# Languages for which NLTK ships stop-word lists (and Snowball stemmers).
stopWordLanguageList = ['arabic', 'azerbaijani', 'danish', 'dutch', 'english',
                        'finnish', 'french', 'german', 'greek', 'hungarian',
                        'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian',
                        'portuguese', 'romanian', 'russian', 'slovene', 'spanish',
                        'swedish', 'tajik', 'turkish']

form = sg.FlexForm('Text Analysis Parameters')  # begin with a blank form

layout = [
    [sg.Text('How would you like to parse your text?')],
    [sg.Text('Full Path of Input File (Use \\\ as Separators)', size=(42, 1)), sg.InputText('C:\\\Your Folder\\\Your File.csv')],
    [sg.Text('Comma-Separated List of Fields to Analyze (No Spaces)', size=(42, 1)), sg.InputText('Field1,Field2')],
    [sg.Text('N-Gram Size (Number of Words)', size=(42, 1)), sg.InputText('6')],
    [sg.Text('Number of Sections', size=(42, 1)), sg.InputText('10')],
    [sg.Text('Language to Use for Stop Words', size=(42, 1)), sg.Combo(values=stopWordLanguageList, default_value='english')],
    [sg.Submit(), sg.Cancel()]
]

button, values = form.Layout(layout).Read()

# Continue only if the user did not cancel/close and the numeric fields parse.
if button == "Cancel" or button is None or not values[2].isdigit() or not values[3].isdigit():
    sys.exit("You either canceled/closed the dialog or entered an invalid parameter. Exiting the program.")

# Read the dialog input.
inputFile = values[0]
wordFilePath = os.path.dirname(inputFile)
textFields = values[1].split(',')
stopWordLanguage = values[4]
numberOfWords = int(values[2])       # n-gram size
numberOfSections = int(values[3])    # sections to divide each text into

# Valid stop word language?
if stopWordLanguage not in stopWordLanguageList:
    sys.exit("Invalid stop word language. Exiting program.")

# Check to make sure the input file exists.
if not os.path.exists(inputFile):
    sys.exit("Input file does not exist. Exiting the program.")

# Output files live next to the input file; os.path.join picks the right
# separator for the platform (the original hard-coded "\\").
wordsFile = os.path.join(wordFilePath, "Words.csv")
ngramsFile = os.path.join(wordFilePath, "NGrams.csv")

# Delete any previously written output so the append-mode writes below
# start from a clean file.
for outFile in (wordsFile, ngramsFile):
    if os.path.exists(outFile):
        os.remove(outFile)

# Set of stop words for the chosen language.
# NOTE: renamed from "stopwords" — the original rebound the name of the
# imported nltk.corpus.stopwords module, shadowing it for the rest of the run.
stopWordSet = set(stopwords.words(stopWordLanguage))

stemmer = SnowballStemmer(stopWordLanguage)

# Build the sentiment analyzer once for the whole run; the original created a
# new one per record, reloading the VADER lexicon each time.
nltkSentiment = SentimentIntensityAnalyzer()


def _position_counters(count, sectionWordCount):
    """Yield (number, section, numberInSection) for each of *count* items.

    The section advances every *sectionWordCount* items, matching the
    original counter/section bookkeeping. A sectionWordCount of 0 (empty
    text) is guarded so the modulo never divides by zero.
    """
    section = 1
    numberInSection = 1
    for number in range(1, count + 1):
        yield number, section, numberInSection
        if sectionWordCount and numberInSection % sectionWordCount == 0:
            section += 1
            numberInSection = 1
        else:
            numberInSection += 1


def _append_rows(outFile, heading, writeHeading, rows):
    """Append *rows* to the CSV *outFile*, writing *heading* first when asked."""
    with open(outFile, 'a', newline='', encoding='utf-8') as out:
        csvOut = csv.writer(out)
        if writeHeading:
            csvOut.writerow(heading)
        csvOut.writerows(rows)


recordCounter = 0

# Allow very large text cells without the csv module raising
# "field larger than field limit" (capped at a 32-bit-safe value).
csv.field_size_limit(min(sys.maxsize, 2147483646))

# Open the input csv file. Loop through each record and process each field.
with open(inputFile, mode='r', encoding='utf-8') as csvFile:
    csvReader = csv.DictReader(csvFile)
    for csvRow in csvReader:
        recordID = csvRow["Record ID"]

        # Process each text field.
        for textItem in textFields:
            text = csvRow[textItem]
            recordCounter += 1

            # Text cleanup: collapse newlines, lowercase, and replace all
            # non-alphanumeric characters with spaces so splitting on spaces
            # is a safe tokenizer.
            text = text.replace('\n', ' ')
            text = " " + text.lower()
            text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

            tokens = [token for token in text.split(" ") if token != ""]

            # ---- Single words -------------------------------------------
            output = list(ngrams(tokens, 1))

            # Section size is based on the single-word count; the same size
            # is reused for the n-gram pass below (as in the original) so
            # sections line up by word position.
            sectionWordCount = math.ceil(len(output) / numberOfSections)

            wordRows = []
            for row, (wordNumber, section, numberInSection) in zip(
                    output, _position_counters(len(output), sectionWordCount)):
                word = ''.join(row)                     # 1-tuple -> string
                wordStem = stemmer.stem(word)
                compoundScore = nltkSentiment.polarity_scores(word)['compound']
                isStopWord = word in stopWordSet
                wordRows.append((str(textItem), str(recordID)) + row +
                                (wordStem, str(isStopWord), str(compoundScore),
                                 str(wordNumber), str(section), str(numberInSection)))

            wordHeading = ('Field', 'Record ID', 'Word', 'Stem', 'Stop Word',
                           'Sentiment', 'Word Number', 'Section', 'Section Word Number')
            # Heading goes out only on the very first field of the first record.
            _append_rows(wordsFile, wordHeading, recordCounter == 1, wordRows)

            # ---- N-grams ------------------------------------------------
            output = list(ngrams(tokens, numberOfWords))

            ngramRows = []
            for row, (gramNumber, section, numberInSection) in zip(
                    output, _position_counters(len(output), sectionWordCount)):
                fullLine = ' '.join(row)                # the full n-gram text
                compoundScore = nltkSentiment.polarity_scores(fullLine)['compound']
                ngramRows.append((str(textItem), str(recordID)) + row +
                                 (fullLine, str(compoundScore), str(gramNumber),
                                  str(section), str(numberInSection)))

            ngramHeading = (('Field', 'Record ID') +
                            tuple('Word' + str(i) for i in range(1, numberOfWords + 1)) +
                            ('Full N-Gram', 'N-Gram Sentiment', 'N-Gram Number',
                             'Section', 'Section N-Gram Number'))
            _append_rows(ngramsFile, ngramHeading, recordCounter == 1, ngramRows)