├── LICENSE ├── README.md └── Text.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Ken Flerlage 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text-Analysis 2 | Python script that will break text into words and n-grams. You can then leverage this data in Tableau to create a variety of charts. Some examples shown below. 
3 | 4 | Original Blog: https://www.flerlagetwins.com/2019/09/text-analysis.html 5 | 6 | ![Text Analysis](https://1.bp.blogspot.com/-YP_eFyw_dyM/XXQ7Q2WQWMI/AAAAAAAAOeo/m8-eZob5fdoC4RXIBM-J5aBui5hjeD3RACEwYBhgL/s1600/00%2BChart%2BMenu.png) 7 | 8 | -------------------------------------------------------------------------------- /Text.py: -------------------------------------------------------------------------------- 1 | # This code will read an input csv file and break specified text fields into words and n-grams. 2 | # Basic sentiment scores will also be assigned to each word and n-gram. 3 | # Stop words will also be identified in the word breakout. 4 | # Two csv files (one for words and one for n-grams) will be written. 5 | # 6 | # Written by Ken Flerlage, August, 2019 7 | # 8 | # Note: This code makes use of the Natural Language Toolkit (NLTK) library. 9 | # Be sure to download all the nltk data by using the following command: python -m nltk.downloader all 10 | # ...or you can run the following commands to save time and disk space: 11 | # python -m nltk.downloader vader_lexicon 12 | # python -m nltk.downloader stopwords 13 | # To add more stop words, edit the language file in nltk_data\corpora\stopwords 14 | # 15 | # This code uses a number of other libraries which may need to be installed before using. 
16 | # 17 | # -------------------------------------------------------------------------------------------------------- 18 | # Copyright (c) 2019 Kenneth Flerlage 19 | # 20 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 21 | # associated documentation files (the "Software"), to deal in the Software without restriction, including 22 | # without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 23 | # copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to 24 | # the following conditions: The above copyright notice and this permission notice shall be included in 25 | # all copies or substantial portions of the Software. 26 | # 27 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT 28 | # LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN 29 | # NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 30 | # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 31 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 32 | # -------------------------------------------------------------------------------------------------------- 33 | 34 | import csv 35 | import re 36 | import math 37 | import os 38 | import sys 39 | from nltk.util import ngrams 40 | from nltk.corpus import stopwords 41 | from nltk.sentiment.vader import SentimentIntensityAnalyzer 42 | from nltk.stem import SnowballStemmer 43 | import PySimpleGUI as sg 44 | 45 | # Get user input. 
# ---------------------------------------------------------------------------
# Script body: prompt for parameters, then break each requested text field of
# the input CSV into single words and n-grams, assign a VADER sentiment score
# to each, and write the results to Words.csv and NGrams.csv in the same
# folder as the input file.
# ---------------------------------------------------------------------------

# Languages for which NLTK ships stop-word lists (and Snowball stemmers).
stopWordLanguageList = ['arabic', 'azerbaijani', 'danish', 'dutch', 'english',
                        'finnish', 'french', 'german', 'greek', 'hungarian',
                        'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian',
                        'portuguese', 'romanian', 'russian', 'slovene', 'spanish',
                        'swedish', 'tajik', 'turkish']

form = sg.FlexForm('Text Analysis Parameters')  # begin with a blank form

layout = [
    [sg.Text('How would you like to parse your text?')],
    [sg.Text('Full Path of Input File (Use \\\ as Separators)', size=(42, 1)), sg.InputText('C:\\\Your Folder\\\Your File.csv')],
    [sg.Text('Comma-Separated List of Fields to Analyze (No Spaces)', size=(42, 1)), sg.InputText('Field1,Field2')],
    [sg.Text('N-Gram Size (Number of Words)', size=(42, 1)), sg.InputText('6')],
    [sg.Text('Number of Sections', size=(42, 1)), sg.InputText('10')],
    [sg.Text('Language to Use for Stop Words', size=(42, 1)), sg.Combo(values=stopWordLanguageList, default_value='english')],
    [sg.Submit(), sg.Cancel()]
]

button, values = form.Layout(layout).Read()

# Continue only if the user did not cancel/close and the numeric fields parse.
if button == "Cancel" or button is None or not values[2].isdigit() or not values[3].isdigit():
    sys.exit("You either canceled/closed the dialog or entered an invalid parameter. Exiting the program.")

# Read the dialog input.
inputFile = values[0]
wordFilePath = os.path.dirname(inputFile)
textFields = values[1].split(',')
stopWordLanguage = values[4]
numberOfWords = int(values[2])       # n-gram size
numberOfSections = int(values[3])    # sections to divide each text into

# Valid stop word language?
if stopWordLanguage not in stopWordLanguageList:
    sys.exit("Invalid stop word language. Exiting program.")

# Check to make sure the input file exists.
if not os.path.exists(inputFile):
    sys.exit("Input file does not exist. Exiting the program.")

# Output files live next to the input file; os.path.join picks the right
# separator for the platform (the original hard-coded "\\").
wordsFile = os.path.join(wordFilePath, "Words.csv")
ngramsFile = os.path.join(wordFilePath, "NGrams.csv")

# Delete any previously written output so the append-mode writes below
# start from a clean file.
for outFile in (wordsFile, ngramsFile):
    if os.path.exists(outFile):
        os.remove(outFile)

# Set of stop words for the chosen language.
# NOTE: renamed from "stopwords" — the original rebound the name of the
# imported nltk.corpus.stopwords module, shadowing it for the rest of the run.
stopWordSet = set(stopwords.words(stopWordLanguage))

stemmer = SnowballStemmer(stopWordLanguage)

# Build the sentiment analyzer once for the whole run; the original created a
# new one per record, reloading the VADER lexicon each time.
nltkSentiment = SentimentIntensityAnalyzer()


def _position_counters(count, sectionWordCount):
    """Yield (number, section, numberInSection) for each of *count* items.

    The section advances every *sectionWordCount* items, matching the
    original counter/section bookkeeping. A sectionWordCount of 0 (empty
    text) is guarded so the modulo never divides by zero.
    """
    section = 1
    numberInSection = 1
    for number in range(1, count + 1):
        yield number, section, numberInSection
        if sectionWordCount and numberInSection % sectionWordCount == 0:
            section += 1
            numberInSection = 1
        else:
            numberInSection += 1


def _append_rows(outFile, heading, writeHeading, rows):
    """Append *rows* to the CSV *outFile*, writing *heading* first when asked."""
    with open(outFile, 'a', newline='', encoding='utf-8') as out:
        csvOut = csv.writer(out)
        if writeHeading:
            csvOut.writerow(heading)
        csvOut.writerows(rows)


recordCounter = 0

# Allow very large text cells without the csv module raising
# "field larger than field limit" (capped at a 32-bit-safe value).
csv.field_size_limit(min(sys.maxsize, 2147483646))

# Open the input csv file. Loop through each record and process each field.
with open(inputFile, mode='r', encoding='utf-8') as csvFile:
    csvReader = csv.DictReader(csvFile)
    for csvRow in csvReader:
        recordID = csvRow["Record ID"]

        # Process each text field.
        for textItem in textFields:
            text = csvRow[textItem]
            recordCounter += 1

            # Text cleanup: collapse newlines, lowercase, and replace all
            # non-alphanumeric characters with spaces so splitting on spaces
            # is a safe tokenizer.
            text = text.replace('\n', ' ')
            text = " " + text.lower()
            text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

            tokens = [token for token in text.split(" ") if token != ""]

            # ---- Single words -------------------------------------------
            output = list(ngrams(tokens, 1))

            # Section size is based on the single-word count; the same size
            # is reused for the n-gram pass below (as in the original) so
            # sections line up by word position.
            sectionWordCount = math.ceil(len(output) / numberOfSections)

            wordRows = []
            for row, (wordNumber, section, numberInSection) in zip(
                    output, _position_counters(len(output), sectionWordCount)):
                word = ''.join(row)                     # 1-tuple -> string
                wordStem = stemmer.stem(word)
                compoundScore = nltkSentiment.polarity_scores(word)['compound']
                isStopWord = word in stopWordSet
                wordRows.append((str(textItem), str(recordID)) + row +
                                (wordStem, str(isStopWord), str(compoundScore),
                                 str(wordNumber), str(section), str(numberInSection)))

            wordHeading = ('Field', 'Record ID', 'Word', 'Stem', 'Stop Word',
                           'Sentiment', 'Word Number', 'Section', 'Section Word Number')
            # Heading goes out only on the very first field of the first record.
            _append_rows(wordsFile, wordHeading, recordCounter == 1, wordRows)

            # ---- N-grams ------------------------------------------------
            output = list(ngrams(tokens, numberOfWords))

            ngramRows = []
            for row, (gramNumber, section, numberInSection) in zip(
                    output, _position_counters(len(output), sectionWordCount)):
                fullLine = ' '.join(row)                # the full n-gram text
                compoundScore = nltkSentiment.polarity_scores(fullLine)['compound']
                ngramRows.append((str(textItem), str(recordID)) + row +
                                 (fullLine, str(compoundScore), str(gramNumber),
                                  str(section), str(numberInSection)))

            ngramHeading = (('Field', 'Record ID') +
                            tuple('Word' + str(i) for i in range(1, numberOfWords + 1)) +
                            ('Full N-Gram', 'N-Gram Sentiment', 'N-Gram Number',
                             'Section', 'Section N-Gram Number'))
            _append_rows(ngramsFile, ngramHeading, recordCounter == 1, ngramRows)