├── index.txt
├── removeDups_fortest.py
├── LICENSE.txt
├── refine.py
├── censusAmericansBot.py
├── README.md
├── draft.py
└── draftTweet_byState.py

/index.txt:
--------------------------------------------------------------------------------
0
--------------------------------------------------------------------------------
/removeDups_fortest.py:
--------------------------------------------------------------------------------
import csv
import random

def removeDups():
    with open("data/tweets.csv", 'rb') as csvfile:
        spamreader = csv.reader(csvfile)
        text = []
        dups = []
        for row in spamreader:
            # count a row as a dup if we've already seen it, or if its tweet text is over 140 chars
            if row in text or len("".join(row)) > 140:
                dups.append(row)
            else:
                text.append(row)
        print text
        print len(dups)

removeDups()
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
Copyright (c) 2012 Allison Parrish

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/refine.py:
--------------------------------------------------------------------------------
import csv
import random

states = ['ak', 'usb', 'usc', 'usd']
currentFile = 0
fileRoot = 'data/ss13p'
priority1 = []
priority2 = []
priority3 = []

def reduceDataByColumn(infile, outfile):
    existingTweets = []
    with open(outfile, 'wb') as outputFile:
        spamwriter = csv.writer(outputFile)
        with open(infile, 'rb') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                newRow = []
                newRowText = ""
                for column in row:
                    # keep only columns that already hold a full phrase
                    if len(column) > 6:
                        newRow.append(column)
                        newRowText += column
                shortTweet = ""
                rowLength = len(newRow)
                sampling = random.sample(range(1, len(newRow)), rowLength - 1)
                for i in sampling:
                    shortTweet += str(newRow[i])

                # resample with one phrase fewer each time until the tweet fits in 140 chars
                while len(shortTweet) > 140 and rowLength > 3:
                    sampling = random.sample(range(1, len(newRow)), rowLength - 1)
                    rowLength = rowLength - 1
                    shortTweet = ""
                    for i in sampling:
                        shortTweet += str(newRow[i])
                if shortTweet in existingTweets:
                    print "repeat"
                else:
                    existingTweets.append(shortTweet)
                    spamwriter.writerow([shortTweet])

infile = fileRoot + states[currentFile] + "_filledin.csv"
outfile = fileRoot + states[currentFile] + "_refined.csv"
reduceDataByColumn(infile, outfile)

maxLength = 31
def findCompleteRow(infile):
    maxLength = 0
    with open(infile, 'rb') as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            rowLength = len(row)
            for column in row:
                if column == "":
                    rowLength = rowLength - 1
            if rowLength > maxLength:
                maxLength = rowLength
            if rowLength > 30:
                print row
    print maxLength

#findCompleteRow(infile)
--------------------------------------------------------------------------------
/censusAmericansBot.py:
--------------------------------------------------------------------------------
#code - everywordbot by allison parrish
import tweepy
import os
import time

class EverywordBot(object):

    def __init__(self, consumer_key, consumer_secret,
                 access_token, token_secret,
                 source_file_name, index_file_name,
                 lat=None, long=None, place_id=None):
        self.source_file_name = source_file_name
        self.index_file_name = index_file_name
        self.lat = lat
        self.long = long
        self.place_id = place_id

        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, token_secret)
        self.twitter = tweepy.API(auth)

    def _get_current_index(self):
        if not(os.path.isfile(self.index_file_name)):
            return 0
        with open(self.index_file_name) as index_fh:
            return int(index_fh.read().strip())

    def _increment_index(self, index):
        with open(self.index_file_name, "w") as index_fh:
            index_fh.truncate()
            index_fh.write("%d" % (index + 1))

    def _get_current_line(self, index):
        with open(self.source_file_name) as source_fh:
            # read the desired line
            for i, status_str in enumerate(source_fh):
                if i == index:
                    break
            return status_str.strip()

    def post(self):
        index = self._get_current_index()
        status_str = self._get_current_line(index)
        self.twitter.update_status(status=status_str,
                                   lat=self.lat, long=self.long,
                                   place_id=self.place_id)
        self._increment_index(index)

if __name__ == '__main__':
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('--consumer_key', dest='consumer_key',
                      help="twitter consumer key")
    parser.add_option('--consumer_secret', dest='consumer_secret',
                      help="twitter consumer secret")
    parser.add_option('--access_token', dest='access_token',
                      help="twitter token key")
    parser.add_option('--token_secret', dest='token_secret',
                      help="twitter token secret")
    parser.add_option('--source_file', dest='source_file',
                      default="tweet_list.txt",
                      help="source file (one line per tweet)")
    parser.add_option('--index_file', dest='index_file',
                      default="index",
                      help="index file (must be able to write to this file)")
    parser.add_option('--lat', dest='lat',
                      help="The latitude for tweets")
    parser.add_option('--long', dest='long',
                      help="The longitude for tweets")
    parser.add_option('--place_id', dest='place_id',
                      help="Twitter ID of location for tweets")
    (options, args) = parser.parse_args()

    bot = EverywordBot(options.consumer_key, options.consumer_secret,
                       options.access_token, options.token_secret,
                       options.source_file, options.index_file,
                       options.lat, options.long, options.place_id)
    numberOfTweets = 0
    while True:
        numberOfTweets += 1
        bot.post()
        print "tweeeeet ..." + str(numberOfTweets) + " ... %s" % time.ctime()
        time.sleep(14400)  # 4 hours between tweets
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# censusAmericans
A Twitter bot that automatically posts short bios of Americans, drawn from census data, to Twitter.

The data is public and anonymized.
censusAmericans takes Public Use Microdata Sample (PUMS) data and reconstitutes it into mini narratives that describe real individuals who participated in the extended census in 2013. The PUMS is a limited subset of the American Community Survey, released to give researchers access to detailed profiles of anonymized individuals from each state. The profiles include items that, when assembled, have the potential to describe individuals for further study, but not so much detail that they can be deanonymized. For example, while they include relatable details such as the length, method, and time of a person's daily commute to work, the snapshots are also limited by omissions such as the lack of a person's location.

We made them into bios because even the limited information on offer seemed to communicate individuals effectively - thankfully, we only need to know a little about a person in order to relate to them.
Whether it is how much they work, who they take care of, or where they were born, just a few descriptors are enough. We hope some of these qualities are preserved even when we further limit the reconstituted bios of these Americans to the length of a tweet. We built the Twitter account to generate these bios efficiently and automatically broadcast them every few hours until every person in the data has been covered.
Even though the limitation on length will sometimes produce similar or less satisfying bios (we ourselves would find it limiting to be described in such spare terms and categorized so broadly), it remains interesting to think about these people because they are real, and broadcasting them ambiently and constantly might even shorten the distance between us.

Here are some people:
- "I've been married a few times. I work in sporting and athletic goods, and doll industry. I've never served in the military."
- "I was naturalized as an U. S. citizen. I had less than 2 weeks off last year. I work in construction."
- "I live with my parents. I'm unemployed, have not worked for the last 5 years. I've not worked for at least 5 years."
- "I've been taking care of my grandkids for more than 5 years. I work in amusement, gambling, and recreation industries."

You can follow the census here: @censusAmericans

Data:
- American Community Survey's Public Use Microdata Sample (PUMS) dataset
- from http://www.census.gov/acs/www/data_documentation/public_use_microdata_sample/

Code:

The data is processed in 3 steps, so the code here is split into 3 Python scripts. It is by no means efficient. A minimal sketch of the overall translate-and-trim idea appears after the TODO list below.

- draft.py isolates the columns with content (dropping identification codes and redundant columns) and turns the raw data from above into human-readable form, using dictionaries created to make each line of data sound more conversational. Example: column JWTR with value 06 is translated into "I take a ferryboat to work. " This results in a very large file of "bios" that you can read for a sanity check.

- refine.py checks each entry from the previous script and randomly combines 3-4 sentences/columns until each entry is under 140 characters and short enough to tweet.

- censusAmericansBot.py uses tweepy, the Python Twitter API library, to post one row from the resulting file. Currently it posts 1 line every 4 hours and keeps an index file to track its progress. Based on the everywordbot code by Allison Parrish (NYU) - her projects: http://www.decontextualize.com/

- Setting up a Twitter account and app to run the script is simple; I followed the instructions here: http://zachwhalen.net/posts/how-to-make-a-twitter-bot-with-google-spreadsheets-version-04

TODO:
- send a notification email when the script fails
- geolocate tweets according to state?
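
The core of the pipeline is small enough to sketch. The snippet below is illustrative only - the DICTIONARIES table and the translate_row / trim_to_tweet helpers are hypothetical names invented for this example, not functions from the scripts in this repo (the real lookups live in category_dictionaries/*.csv and the real logic is spread across draft.py and refine.py) - but it shows the two moves the scripts make: translate coded columns into phrases, then randomly drop phrases until the bio fits in a tweet.

```python
import random

# Hypothetical lookup tables; the real ones are the csv files in category_dictionaries/.
DICTIONARIES = {
    "JWTR": {"06": "I take a ferryboat to work. "},
    "MIL":  {"4": "I've never served in the military. "},
    "GCL":  {"1": "I've been taking care of my grandkids. "},
}

def translate_row(row):
    """Turn a dict of {column code: value} into a list of phrases."""
    phrases = []
    for column, value in row.items():
        phrase = DICTIONARIES.get(column, {}).get(value)
        if phrase:
            phrases.append(phrase)
    return phrases

def trim_to_tweet(phrases, limit=140):
    """Randomly drop phrases until the combined bio fits within the limit."""
    keep = list(phrases)
    bio = "".join(keep)
    while len(bio) > limit and len(keep) > 3:
        keep = random.sample(keep, len(keep) - 1)
        bio = "".join(keep)
    return bio

if __name__ == "__main__":
    example = {"JWTR": "06", "MIL": "4", "GCL": "1"}
    print(trim_to_tweet(translate_row(example)))
```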

--------------------------------------------------------------------------------
/draft.py:
--------------------------------------------------------------------------------
import csv

# all headers available in the raw data
#headers_long = ["ST","CIT","CITWP05","CITWP12","COW","DEAR","DEYE","DOUT","DPHY","DREM","ENG","FER","GCL","GCM","GCR","HINS1","HINS2","HINS3","HINS4","HINS5","HINS7","JWMNP","JWRIP","JWTR","LANX","MAR","MARHD","MARHM","MARHT","MARHW","MARHYP05","MARHYP05","MIG","MIL","MLPA","MLPB","MLPCD","MLPE","MLPH","MLPJ","NWLA","NWLK","OIP","PAP","RELP","RETP","SCH","SCHG","SCHL","SEMP","SEX","WKHP","WKL","WKW","YOEP05","YOEP12","ANC","ANC1P05","ANC1P12","ESP","FOD1P","FOD2P","HICOV","INDP","JWAP","JWDP","LANP05","MSP"]
# headers actually used to build the bios
headers = ["JWMNP","CITWP05","MARHYP05","WKHP","WAGP","JWDP","ANC","ST","ANC1P12","FER","JWAP","MARHT","NWLA","CIT","FOD1P","JWRIP","MARHW","NWLK","COW","GCL","JWTR","MIG","SCHL","DEAR","GCM","LANP05","DEYE","HICOV","MAR","MIL","VPS","ENG","HISP","MARHD","NATIVITY","WAOB","ESP","INDP","MARHM","NOP","WKW"]

def dataHeaders():
    # return the header row of the raw PUMS file
    with open('alabama.csv', 'rb') as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            return row

# find the index of each header in use within the full header row
def getHeaderIndex():
    headerDictionary = {}
    indexList = []
    allHeaders = dataHeaders()
    for header in headers:
        headerIndex = allHeaders.index(header)
        headerDictionary[header] = headerIndex
        indexList.append(headerIndex)
    return indexList

def reduceDataByColumn(infile, outfile):
    print "reduce to useful columns ..."
    indexList = getHeaderIndex()
    reducedRowsList = []
    with open(outfile, 'wb') as outputFile:
        spamwriter = csv.writer(outputFile)
        with open(infile, 'rb') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                reducedRow = []
                for index in indexList:
                    reducedRow.append(row[index])
                if reducedRow in reducedRowsList:
                    print "duplicate"
                else:
                    reducedRowsList.append(reducedRow)
                    spamwriter.writerow(reducedRow)

def columnDicts(infile):
    # read a two-column csv into a {code: phrase} dictionary
    with open(infile, 'rb') as csvfile:
        spamreader = csv.reader(csvfile)
        headers = spamreader.next()
        outfile = {}
        for row in spamreader:
            outfile[row[0]] = row[1]
        return outfile


def fillInData(infile, outfile):
    print "filling in data ..."
    rowsDone = 0
    with open(outfile, 'wb') as outputfile:
        w = csv.writer(outputfile)
        with open(infile, 'rb') as datafile:
            r = csv.reader(datafile)
            headers = r.next()
            print headers
            for row in r:
                rowsDone += 1
                if rowsDone % 10000 == 0:
                    print rowsDone
                for i in headers:
                    currentIndex = headers.index(i)
                    # note: this re-reads the dictionary file for every column of every row
                    currentDictionary = columnDicts("category_dictionaries/" + i + '.csv')
                    if row[currentIndex] in currentDictionary:
                        if i == "ST":
                            # the state becomes the greeting at the front of the row
                            state = currentDictionary[row[currentIndex]]
                            greeting = "hi from " + state + ","
                            row[0] = greeting
                            row[currentIndex] = ""
                        else:
                            row[currentIndex] = currentDictionary[row[currentIndex]]
                    elif i == "CITWP05":
                        if row[currentIndex] != "" and row[currentIndex] != "-009" and len(row[currentIndex]) == 4:
                            row[currentIndex] = "I was naturalized in " + str(row[currentIndex]) + ". "
                        else:
                            row[currentIndex] = ""
                    elif i == "JWMNP":
                        if row[currentIndex] != "":
                            commute = int(row[currentIndex])
                            # check the longest commutes first so every branch is reachable
                            if commute > 120:
                                row[currentIndex] = "It takes me more than 2 hours to get to work. "
                            elif commute > 10:
                                row[currentIndex] = "my daily commute is " + str(commute) + "mins long. "
                            elif commute > 0:
                                row[currentIndex] = "it only takes me " + str(commute) + "mins to get to work. "
                            else:
                                row[currentIndex] = ""
                        else:
                            row[currentIndex] = ""

                    elif i == "MARHYP05":
                        # check how many times this person has been married
                        timesMarried = row[headers.index("MARHT")]
                        if (timesMarried == "1" or timesMarried == "I have only been married once. ") and len(row[currentIndex]) == 4:
                            phrase = "I got married in "
                            row[currentIndex] = phrase + row[currentIndex] + ". "
                        else:
                            if row[currentIndex] != "" and row[currentIndex] != " " and len(row[currentIndex]) == 4:
                                phrase = "Last time I got married was in "
                                row[currentIndex] = phrase + row[currentIndex] + ". "
                            else:
                                row[currentIndex] = ""
                    elif i == "WKHP":
                        if row[currentIndex] != "":
                            hoursWorkedPerWeek = int(row[currentIndex])
                            if hoursWorkedPerWeek > 40:
                                row[currentIndex] = "Usually work " + str(hoursWorkedPerWeek) + "hrs per week. "
                            else:
                                row[currentIndex] = "I work less than 40 hours per week. "
" 127 | else: 128 | row[currentIndex] ="" 129 | 130 | w.writerow(row) 131 | 132 | states = ['ak','usa','usb','usc','usd'] 133 | fileRoot = 'data/ss13p' 134 | for i in range(len(states)): 135 | print i 136 | infile = fileRoot+states[i]+".csv" 137 | outfile = fileRoot+states[i]+"_out.csv" 138 | outfile2 = fileRoot+states[i]+"_filledin.csv" 139 | print infile,outfile,outfile2 140 | reduceDataByColumn(infile,outfile) 141 | fillInData(outfile,outfile2) -------------------------------------------------------------------------------- /draftTweet_byState.py: -------------------------------------------------------------------------------- 1 | import csv 2 | #all headers 3 | #def dataHeaders(): 4 | # with open('alabama.csv', 'rb') as csvfile: 5 | # spamreader = csv.reader(csvfile) 6 | # for row in spamreader: 7 | # return row 8 | # 9 | #headers in current data 10 | #headers_long = ["ST","CIT","CITWP05","CITWP12","COW","DEAR","DEYE","DOUT","DPHY","DREM","ENG","FER","GCL","GCM","GCR","HINS1","HINS2","HINS3","HINS4","HINS5","HINS7","JWMNP","JWRIP","JWTR","LANX","MAR","MARHD","MARHM","MARHT","MARHW","MARHYP05","MARHYP05","MIG","MIL","MLPA","MLPB","MLPCD","MLPE","MLPH","MLPJ","NWLA","NWLK","OIP","PAP","RELP","RETP","SCH","SCHG","SCHL","SEMP","SEX","WKHP","WKL","WKW","YOEP05","YOEP12","ANC","ANC1P05","ANC1P12","ESP","FOD1P","FOD2P","HICOV","INDP","JWAP","JWDP","LANP05","MSP"] 11 | headers = ["JWMNP","CITWP05","MARHYP05","WKHP","WAGP","JWDP","ANC","ST","ANC1P12","FER","JWAP","MARHT","NWLA","CIT","FOD1P","JWRIP","MARHW","NWLK","COW","GCL","JWTR","MIG","SCHL","DEAR","GCM","LANP05","DEYE","HICOV","MAR","MIL","VPS","ENG","HISP","MARHD","NATIVITY","WAOB","ESP","INDP","MARHM","NOP","WKW"] 12 | def dataHeaders(): 13 | with open('alabama.csv', 'rb') as csvfile: 14 | spamreader = csv.reader(csvfile) 15 | for row in spamreader: 16 | return row 17 | 18 | #get headers in use index in all headers 19 | def getHeaderIndex(): 20 | headerDictionary = {} 21 | indexList = [] 22 | for header in headers: 23 | #print header 24 | headerIndex = dataHeaders().index(header) 25 | headerDictionary[header]=headerIndex 26 | indexList.append(headerIndex) 27 | #return headerDictionary 28 | return indexList 29 | 30 | def reduceDataByColumn(infile,outfile): 31 | print "reduce to useful columns ..." 32 | indexList = getHeaderIndex() 33 | reducedRowsList = [] 34 | with open(outfile,'wb') as outputFile: 35 | spamwriter = csv.writer(outputFile) 36 | with open(infile, 'rb') as csvfile: 37 | spamreader = csv.reader(csvfile) 38 | # headerDictionary = replaceHeaderCodes() 39 | rowsDone = 0 40 | for row in spamreader: 41 | reducedRow = [] 42 | for index in indexList: 43 | reducedRow.append(row[index]) 44 | if reducedRow in reducedRowsList: 45 | print "dupilicat" 46 | else: 47 | spamwriter.writerow(reducedRow) 48 | # print reducedRow 49 | 50 | def columnDicts(infile): 51 | with open(infile, 'rb') as csvfile: 52 | spamreader = csv.reader(csvfile) 53 | headers = spamreader.next() 54 | #print headers 55 | outfile = {} 56 | for row in spamreader: 57 | outfile[row[0]]=row[1] 58 | return outfile 59 | 60 | 61 | def fillInData(infile,outfile): 62 | print "filling in data ..." 
    rowsDone = 0
    with open(outfile, 'wb') as outputfile:
        w = csv.writer(outputfile)
        with open(infile, 'rb') as datafile:
            r = csv.reader(datafile)
            headers = r.next()
            print headers
            for row in r:
                rowsDone += 1
                if rowsDone % 10000 == 0:
                    print rowsDone
                for i in headers:
                    currentIndex = headers.index(i)
                    # note: this re-reads the dictionary file for every column of every row
                    currentDictionary = columnDicts("category_dictionaries/" + i + '.csv')
                    if row[currentIndex] in currentDictionary:
                        if i == "ST":
                            # the state becomes the greeting at the front of the row
                            state = currentDictionary[row[currentIndex]]
                            greeting = "hi from " + state + ","
                            row[0] = greeting
                            row[currentIndex] = ""
                        else:
                            row[currentIndex] = currentDictionary[row[currentIndex]]
                    elif i == "CITWP05":
                        if row[currentIndex] != "" and row[currentIndex] != "-009" and len(row[currentIndex]) == 4:
                            row[currentIndex] = "I was naturalized in " + str(row[currentIndex]) + ". "
                        else:
                            row[currentIndex] = ""
                    elif i == "JWMNP":
                        if row[currentIndex] != "":
                            commute = int(row[currentIndex])
                            # check the longest commutes first so every branch is reachable
                            if commute > 120:
                                row[currentIndex] = "It takes me more than 2 hours to get to work. "
                            elif commute > 10:
                                row[currentIndex] = "my daily commute is " + str(commute) + "mins long. "
                            elif commute > 0:
                                row[currentIndex] = "it only takes me " + str(commute) + "mins to get to work. "
                            else:
                                row[currentIndex] = ""
                        else:
                            row[currentIndex] = ""

                    elif i == "MARHYP05":
                        # check how many times this person has been married
                        timesMarried = row[headers.index("MARHT")]
                        if (timesMarried == "1" or timesMarried == "I have only been married once. ") and len(row[currentIndex]) == 4:
                            phrase = "I got married in "
                            row[currentIndex] = phrase + row[currentIndex] + ". "
                        else:
                            if row[currentIndex] != "" and row[currentIndex] != " " and len(row[currentIndex]) == 4:
                                phrase = "Last time I got married was in "
                                row[currentIndex] = phrase + row[currentIndex] + ". "
                            else:
                                row[currentIndex] = ""
                    elif i == "WKHP":
                        if row[currentIndex] != "":
                            hoursWorkedPerWeek = int(row[currentIndex])
                            if hoursWorkedPerWeek > 40:
                                row[currentIndex] = "Usually work " + str(hoursWorkedPerWeek) + "hrs per week. "
                            else:
                                row[currentIndex] = "I work less than 40 hours per week. "
                        else:
                            row[currentIndex] = ""

                w.writerow(row)

states = ['ak', 'usa', 'usb', 'usc', 'usd']
fileRoot = 'data/ss13p'
for i in range(len(states)):
    print i
    infile = fileRoot + states[i] + ".csv"
    outfile = fileRoot + states[i] + "_out.csv"
    outfile2 = fileRoot + states[i] + "_filledin.csv"
    print infile, outfile, outfile2
    reduceDataByColumn(infile, outfile)
    fillInData(outfile, outfile2)
--------------------------------------------------------------------------------