# ├── TweetProcessor.py ├── Ngram.py └── GlobalDistance1.py
# ------------------------------------------------------------------ /TweetProcessor.py:
#!/usr/bin/python
#title          :TweetProcessor.py
#description    :Extracts the tweet texts, converts them to lowercase and
#                removes all non-alphabetic characters except spaces.
#author         :Natasha A Thomas
#date           :20140905
#usage          :python TweetProcessor.py <input file>
#notes          :
#python_version :2.7.6
#=============================================================================

import re
import sys

# Hoisted out of the per-line loop: the pattern never changes, so compile it
# exactly once.  NOTE(fix): matched runs of non-alphabetic, non-space
# characters are deleted (replaced with the empty string), not replaced with
# a space as the original comment claimed.
_NON_ALPHA = re.compile('[^a-zA-Z ]*')


def clean_line(line):
    """Return *line* lower-cased with every character other than
    a-z/A-Z/space removed (this also strips the trailing newline)."""
    return _NON_ALPHA.sub('', line).lower()


def preprocess(input_path, output_path='PreprocessedTweets.txt'):
    """Read tweets from *input_path* and write one cleaned line per input
    line to *output_path* (see clean_line for the cleaning rule)."""
    # 'with' guarantees both handles are closed even if an iteration raises.
    with open(input_path, 'r') as input_file, \
         open(output_path, 'w') as output_file:
        for line in input_file:
            output_file.write(clean_line(line) + '\n')


if __name__ == '__main__':
    preprocess(sys.argv[1])

# ------------------------------------------------------------------ /Ngram.py:
#!/usr/bin/python
#title          :Ngram.py
#description    :Uses an N-Gram distance matching strategy to find approximate
#                matches for location names in tweet texts.
#author         :Natasha A Thomas
#date           :20140905
#usage          :python Ngram.py
#notes          :
#python_version :2.7.6
#=============================================================================

import re  # kept for compatibility; the n-gram test below no longer needs it


def main():
    """For every query name in PreprocessedGeonames.txt, scan every line of
    PreprocessedTweets.txt for token windows whose n-gram overlap with the
    query exceeds *threshold* percent, and record the hits in NgramData.txt."""
    n = 4           # n-gram length
    threshold = 75  # minimum similarity percentage to report a match

    query_file = open("PreprocessedGeonames.txt", "r")
    op_file = open("NgramData.txt", "w")
    for query in query_file:
        querylist = query.split()
        pattern = ''.join(querylist)
        # Number of n-grams in the space-stripped query.
        num_ngrams_pattern = max(len(pattern) - n + 1, 0)
        if num_ngrams_pattern == 0:
            # FIX: queries shorter than n characters used to trigger a
            # ZeroDivisionError in the score below; they can never match.
            continue
        line_count = 0
        ip_file = open("PreprocessedTweets.txt", "r")
        for line in ip_file:
            words = line.split()
            # Sliding windows with the same number of tokens as the query.
            windows = zip(*[words[i:] for i in range(len(querylist))])
            for window in windows:
                candidate = ''.join(window)
                num_ngrams = max(len(candidate) - n + 1, 0)
                # Count candidate n-grams occurring in the query.  FIX: use a
                # plain substring test instead of re.search -- the n-gram is
                # data, not a regular expression.
                count = 0
                for i in range(num_ngrams):
                    if candidate[i:i + n] in pattern:
                        count += 1
                # Floor division kept explicit ('//') to match the original
                # Python 2 integer-division scoring.
                if num_ngrams != 0 and (count * 100 // num_ngrams_pattern) > threshold:
                    op_file.write("Query: ")
                    op_file.write(query + '\n')
                    op_file.write("Approx. match: ")
                    op_file.write(' '.join(window) + '\n')
                    # Recover the tweet id of the matching line by position.
                    tweets = open("Tweets.txt", "r")
                    tweet_count = 0
                    for tweet in tweets:
                        if tweet_count == line_count:
                            op_file.write("Tweet ID: ")
                            op_file.write(str(tweet.split('\t')[1]) + '\n')
                            break
                        tweet_count += 1
                    op_file.write("Tweet: ")
                    op_file.write(line + '\n\n')
                    tweets.close()
            line_count += 1
        ip_file.close()
    query_file.close()
    op_file.close()


if __name__ == '__main__':
    main()
# NOTE(review): the flattened dump cut Ngram.py mid-statement at this point;
# the orphaned tail of its reporting loop is preserved below as comments so
# nothing is lost from the original text:
#   ...("Approx. match: ")
#   opFile.write(' '.join(token) + '\n')
#   tweets = open("Tweets.txt", "r")
#   tweetCount = 0
#   for tweet in tweets:
#       if (tweetCount == lineCount):
#           opFile.write("Tweet ID: ")
#           opFile.write(str(tweet.split('\t')[1]) + '\n')
#           break
#       tweetCount = tweetCount + 1
#   opFile.write("Tweet: ")
#   opFile.write(line + '\n\n')
#   #numresults = numresults + 1
#   tweets.close()
#   lineCount = lineCount + 1
#   #if numresults == 20:
#   #break
#   ipFile.close()
#   #break

# ------------------------------------------------------------------ /GlobalDistance1.py:
#!/usr/bin/python
#title          :GlobalDistance1.py
#description    :Uses a global edit distance matching strategy to find
#                approximate matches for location names in tweet texts.  This
#                version considers multi-word queries as lists of separate words.
#author         :Natasha A Thomas
#date           :20140905
#usage          :python GlobalDistance1.py
#notes          :
#python_version :2.7.6
#=============================================================================


def editDistance(str1, str2, weight=1):
    """Return the global (Levenshtein) edit distance between *str1* and *str2*.

    Insertions and deletions cost 1; substitutions cost *weight*.  Uses the
    classic Wagner-Fischer dynamic program, O(len(str1) * len(str2)).
    """
    len1, len2 = len(str1), len(str2)
    # matrix[i][j] == distance between str1[:i] and str2[:j]
    matrix = [[0] * (len2 + 1) for _ in range(len1 + 1)]
    # FIX: range() instead of the Python-2-only xrange(), so the function also
    # runs under Python 3 (behaviour is identical on 2.7).
    for i in range(len1 + 1):
        for j in range(len2 + 1):
            if min(i, j) == 0:
                # Distance against an empty prefix is the other prefix length.
                matrix[i][j] = max(i, j)
            else:
                sub_cost = 0 if str1[i - 1] == str2[j - 1] else weight
                matrix[i][j] = min(matrix[i - 1][j - 1] + sub_cost,
                                   matrix[i - 1][j] + 1,
                                   matrix[i][j - 1] + 1)
    return matrix[len1][len2]


def main():
    """For every query in PreprocessedGeonames.txt, report token windows of
    PreprocessedTweets.txt whose average per-word edit-distance dissimilarity
    is below *threshold* percent, writing results to Data1.txt."""
    threshold = 40  # maximum dissimilarity percentage to report a match

    query_file = open("PreprocessedGeonames.txt", "r")
    op_file = open("Data1.txt", "w")
    for query in query_file:
        # FIX: removed the debugging leftovers that hard-coded
        # query = "los angeles" and broke out of this loop after the first
        # query; Ngram.py carries the same two lines commented out, so here
        # they were clearly forgotten and defeated the whole query loop.
        count = 0
        querylist = query.split()
        ip_file = open("PreprocessedTweets.txt", "r")
        for line in ip_file:
            names = line.split()
            # Sliding windows with the same number of tokens as the query.
            windows = list(zip(*[names[i:] for i in range(len(querylist))]))
            for window in windows:
                dissimilarity = []
                for qword, tword in zip(querylist, window):
                    # Percent dissimilarity of this word pair, relative to the
                    # query word's length (explicit floor division, matching
                    # the original Python 2 integer arithmetic).
                    dissimilarity.append((editDistance(qword, tword) * 100) // len(qword))
                overall = sum(dissimilarity) // len(dissimilarity)
                if overall < threshold:
                    op_file.write("Query: ")
                    op_file.write(query + '\n')
                    op_file.write("Approx. match: ")
                    op_file.write(' '.join(window) + '\n')
                    # Recover the tweet id of the matching line by position.
                    tweets = open("Tweets.txt", "r")
                    tweet_count = 0
                    for tweet in tweets:
                        if tweet_count == count:
                            op_file.write("Tweet ID: ")
                            op_file.write(str(tweet.split('\t')[1]) + '\n')
                            break
                        tweet_count += 1
                    op_file.write("Tweet: ")
                    op_file.write(line + '\n\n')
                    tweets.close()
            count += 1
        ip_file.close()
    query_file.close()
    op_file.close()


if __name__ == '__main__':
    main()
# --------------------------------------------------------------------------------