├── requirements.txt
├── ngrams
│   └── readme.txt
├── PredictTruecaser.py
├── .gitignore
├── TrainTruecaser.py
├── README.md
├── Truecaser.py
├── TrainFunctions.py
├── EvaluateTruecaser.py
└── LICENSE

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
nltk==3.3
six==1.11.0

--------------------------------------------------------------------------------
/ngrams/readme.txt:
--------------------------------------------------------------------------------
Each of the following free n-gram files contains the (approximately) 1,000,000 most frequent n-grams from the Corpus of Contemporary American English (COCA).

Source: http://www.ngrams.info/download_coca.asp

--------------------------------------------------------------------------------
/PredictTruecaser.py:
--------------------------------------------------------------------------------
from Truecaser import *
import os
import cPickle
import nltk
import string
import argparse
import fileinput


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('files', metavar='FILE', nargs='*', help='files to truecase; if empty, STDIN is used')
    parser.add_argument('-d', '--distribution_object', help='language distribution file', type=os.path.abspath, required=True)
    args = parser.parse_args()

    # Load the pickled frequency distributions produced by TrainTruecaser.py
    f = open(args.distribution_object, 'rb')
    uniDist = cPickle.load(f)
    backwardBiDist = cPickle.load(f)
    forwardBiDist = cPickle.load(f)
    trigramDist = cPickle.load(f)
    wordCasingLookup = cPickle.load(f)
    f.close()

    # Lowercase the input tokens and restore the most probable casing
    for sentence in fileinput.input(files=args.files):
        tokensCorrect = nltk.word_tokenize(sentence)
        tokens = [token.lower() for token in tokensCorrect]
        tokensTrueCase = getTrueCase(tokens, 'title', wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)
        print(" ".join(tokensTrueCase))

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

--------------------------------------------------------------------------------
/TrainTruecaser.py:
--------------------------------------------------------------------------------
"""
This script trains the TrueCase System
"""
import nltk
import nltk.corpus
from nltk.corpus import brown
from nltk.corpus import reuters
import cPickle
import string
import math
import nltk.data

from TrainFunctions import *
from EvaluateTruecaser import defaultTruecaserEvaluation


uniDist = nltk.FreqDist()
backwardBiDist = nltk.FreqDist()
forwardBiDist = nltk.FreqDist()
trigramDist = nltk.FreqDist()
wordCasingLookup = {}




"""
There are three options to train the true caser:
1) Use the sentences in NLTK
2) Use the train.txt file. Each line must contain a single sentence.
   Use a large corpus, for example Wikipedia
3) Use bigram + trigram counts from the website http://www.ngrams.info/download_coca.asp

The more training data, the better the results.
"""


# :: Option 1: Train it based on the NLTK corpus ::
print "Update from NLTK Corpus"
NLTKCorpus = brown.sents()+reuters.sents()+nltk.corpus.semcor.sents()+nltk.corpus.conll2000.sents()+nltk.corpus.state_union.sents()
updateDistributionsFromSentences(NLTKCorpus, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)

# :: Option 2: Train it based on the train.txt file ::
""" #Uncomment, if you want to train from train.txt
print "Update from train.txt file"
sentences = []
for line in open('train.txt'):
    sentences.append(line.strip())

tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
updateDistributionsFromSentences(tokens, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)
"""

# :: Option 3: Train it based on the ngram tables from http://www.ngrams.info/download_coca.asp ::
""" #Uncomment, if you want to train from the n-gram files
print "Update Bigrams / Trigrams"
updateDistributionsFromNgrams('ngrams/w2.txt', 'ngrams/w3.txt', wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)
"""

f = open('distributions.obj', 'wb')
cPickle.dump(uniDist, f, protocol=cPickle.HIGHEST_PROTOCOL)
cPickle.dump(backwardBiDist, f, protocol=cPickle.HIGHEST_PROTOCOL)
cPickle.dump(forwardBiDist, f, protocol=cPickle.HIGHEST_PROTOCOL)
cPickle.dump(trigramDist, f, protocol=cPickle.HIGHEST_PROTOCOL)
cPickle.dump(wordCasingLookup, f, protocol=cPickle.HIGHEST_PROTOCOL)
f.close()



# :: Evaluate the trained model on the built-in test sentences ::

defaultTruecaserEvaluation(wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)




--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Language Independent Truecaser for Python
This is an implementation of a trainable Truecaser for Python.

A truecaser converts a sentence where the casing was lost to the most probable casing. Use cases are sentences that are in all-upper case, in all-lower case or in title case.

A model for English is provided, achieving an accuracy of **98.39%** on a small test set of random sentences from Wikipedia.

# Model
The model was inspired by the paper of [Lucian Vlad Lita et al., tRuEcasIng](https://www.cs.cmu.edu/~llita/papers/lita.truecasing-acl2003.pdf) but with some simplifications.

The model applies a greedy strategy. For each token, from left to right, it computes the following score for every possible casing:

`score(w_0) = P(w_0) * P(w_0 | w_{-1}) * P(w_0 | w_1) * P(w_0 | w_{-1}, w_1)`

with `w_0` the word at the current position, `w_{-1}` the previous word, and `w_1` the next word in the sentence.

All observed casings for `w_0` are tested and the casing with the highest score is selected.

The probabilities `P(...)` are estimated from a large training corpus.

# Requirements
The code was written for Python 2.7 and requires NLTK (`requirements.txt` pins nltk 3.3).

From NLTK, it uses the functions to split sentences into tokens and the FreqDist() class. These parts of the code can easily be replaced, so that the code can also be used without installing NLTK; a rough sketch of possible stand-ins (illustrative only, not part of this repository) is shown below.
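
For example, a simple regex tokenizer and `collections.Counter` could stand in for `nltk.word_tokenize` and `nltk.FreqDist` (a rough sketch under that assumption; `FreqDist` subclasses `Counter`, so the `dist[key] += 1` / `dist[key]` access pattern stays the same):

```python
import re
from collections import Counter

def simple_word_tokenize(sentence):
    # Crude stand-in for nltk.word_tokenize: words, numbers, or single punctuation marks
    return re.findall(r"\w+|[^\w\s]", sentence, re.UNICODE)

# Counter offers the same counting interface the training/scoring code relies on
uniDist = Counter()
for token in simple_word_tokenize("The iPhone was presented by Apple ."):
    uniDist[token] += 1
```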

# Run the Code
You need a `distributions.obj` that contains information on the frequencies of unigrams, bigrams, and trigrams.

A pre-trained `distributions.obj` for English is provided in the [release section](https://github.com/nreimers/truecaser/releases) (name: english_distributions.obj.zip; unzip it before you can use it).

One large `distributions.obj` for English is provided in the download section of GitHub.

You can train your own `distributions.obj` using the `TrainTruecaser.py` script.

To run the model on one (or multiple) text files, pass the `distributions.obj` to the `PredictTruecaser.py` script via the `-d` option. If no text files are provided as arguments, input is read from STDIN.

To evaluate a model, have a look at `EvaluateTruecaser.py`.

# Train your own Truecaser
You can retrain the Truecaser easily. Simply replace the `train.txt` file with a large sample of sentences, change `TrainTruecaser.py` so that it uses `train.txt`, and run the script. You can also use it for languages other than English, such as German, Spanish, or French.


# Disclaimer
Sorry that this is kind of shitty code without documentation. I was looking for a truecaser for my research, but I couldn't find any working implementation. I implemented this script in a hacky manner and it works quite well (at least for me).

I think the code is simple enough that anyone can use and adapt it, and maybe it is handy for someone. The principle behind the code is really simple, but as mentioned above, it achieves good results.

Hint: The casing of company and product names is the hardest. Train the system on a large and recent dataset to achieve the best results (e.g. on a recent dump of Wikipedia).

--------------------------------------------------------------------------------
/Truecaser.py:
--------------------------------------------------------------------------------
import string
import math

"""
This file contains the functions to truecase a sentence.
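
A minimal usage sketch (illustrative only, not a doctest; the exact output
depends on the distributions trained with TrainTruecaser.py):

    tokens = ['the', 'new', 'iphone', 'was', 'presented', 'by', 'apple', '.']
    getTrueCase(tokens, 'title', wordCasingLookup, uniDist,
                backwardBiDist, forwardBiDist, trigramDist)
    # e.g. ['The', 'new', 'iPhone', 'was', 'presented', 'by', 'Apple', '.']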
"""


def getScore(prevToken, possibleToken, nextToken, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    pseudoCount = 5.0

    # Unigram score
    numerator = uniDist[possibleToken] + pseudoCount
    denominator = 0
    for alternativeToken in wordCasingLookup[possibleToken.lower()]:
        denominator += uniDist[alternativeToken] + pseudoCount

    unigramScore = numerator / denominator


    # Backward bigram score
    bigramBackwardScore = 1
    if prevToken is not None:
        numerator = backwardBiDist[prevToken+'_'+possibleToken] + pseudoCount
        denominator = 0
        for alternativeToken in wordCasingLookup[possibleToken.lower()]:
            denominator += backwardBiDist[prevToken+'_'+alternativeToken] + pseudoCount

        bigramBackwardScore = numerator / denominator

    # Forward bigram score
    bigramForwardScore = 1
    if nextToken is not None:
        nextToken = nextToken.lower()  # Ensure it is lower case
        numerator = forwardBiDist[possibleToken+"_"+nextToken] + pseudoCount
        denominator = 0
        for alternativeToken in wordCasingLookup[possibleToken.lower()]:
            denominator += forwardBiDist[alternativeToken+"_"+nextToken] + pseudoCount

        bigramForwardScore = numerator / denominator


    # Trigram score
    trigramScore = 1
    if prevToken is not None and nextToken is not None:
        nextToken = nextToken.lower()  # Ensure it is lower case
        numerator = trigramDist[prevToken+"_"+possibleToken+"_"+nextToken] + pseudoCount
        denominator = 0
        for alternativeToken in wordCasingLookup[possibleToken.lower()]:
            denominator += trigramDist[prevToken+"_"+alternativeToken+"_"+nextToken] + pseudoCount

        trigramScore = numerator / denominator

    result = math.log(unigramScore) + math.log(bigramBackwardScore) + math.log(bigramForwardScore) + math.log(trigramScore)
    #print "Scores: %f %f %f %f = %f" % (unigramScore, bigramBackwardScore, bigramForwardScore, trigramScore, math.exp(result))


    return result

def getTrueCase(tokens, outOfVocabularyTokenOption, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Returns the true case for the passed tokens.
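
    For tokens with several observed casings, getScore() is evaluated for every
    candidate casing and the highest-scoring one is kept. As a purely hypothetical
    illustration of the smoothed unigram factor (pseudoCount = 5.0): if
    uniDist['Apple'] were 90 and uniDist['apple'] were 10, the unigram score of
    'Apple' would be (90 + 5) / ((90 + 5) + (10 + 5)) = 95 / 110 ≈ 0.86.
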
    @param tokens: Tokens in a single sentence
    @param outOfVocabularyTokenOption:
        title: Returns out of vocabulary (OOV) tokens in 'title' format
        lower: Returns OOV tokens in lower case
        as-is: Returns OOV tokens as is
    """
    tokensTrueCase = []
    for tokenIdx in xrange(len(tokens)):
        token = tokens[tokenIdx]
        if token in string.punctuation or token.isdigit():
            tokensTrueCase.append(token)
        else:
            if token in wordCasingLookup:
                if len(wordCasingLookup[token]) == 1:
                    tokensTrueCase.append(list(wordCasingLookup[token])[0])
                else:
                    prevToken = tokensTrueCase[tokenIdx-1] if tokenIdx > 0 else None
                    nextToken = tokens[tokenIdx+1] if tokenIdx < len(tokens)-1 else None

                    bestToken = None
                    highestScore = float("-inf")

                    for possibleToken in wordCasingLookup[token]:
                        score = getScore(prevToken, possibleToken, nextToken, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)

                        if score > highestScore:
                            bestToken = possibleToken
                            highestScore = score

                    tokensTrueCase.append(bestToken)

                if tokenIdx == 0:
                    tokensTrueCase[0] = tokensTrueCase[0].title()

            else:  # Token out of vocabulary
                if outOfVocabularyTokenOption == 'title':
                    tokensTrueCase.append(token.title())
                elif outOfVocabularyTokenOption == 'lower':
                    tokensTrueCase.append(token.lower())
                else:
                    tokensTrueCase.append(token)

    return tokensTrueCase

--------------------------------------------------------------------------------
/TrainFunctions.py:
--------------------------------------------------------------------------------
import nltk

def getCasing(word):
    """ Returns the casing of a word"""
    if len(word) == 0:
        return 'other'
    elif word.isdigit():  # Is a digit
        return 'numeric'
    elif word.islower():  # All lower case
        return 'allLower'
    elif word.isupper():  # All upper case
        return 'allUpper'
    elif word[0].isupper():  # Is a title: initial char upper, then all lower
        return 'initialUpper'

    return 'other'


def checkSentenceSanity(sentence):
    """ Checks the sanity of the sentence. If the sentence is for example all uppercase, it is rejected"""
    caseDist = nltk.FreqDist()

    for token in sentence:
        caseDist[getCasing(token)] += 1

    if caseDist.most_common(1)[0][0] != 'allLower':
        return False

    return True

def updateDistributionsFromSentences(text, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Updates the NLTK frequency distributions based on a list of sentences.
    text: Array of sentences.
          Each sentence must be an array of tokens, for example:
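          (illustrative values, not taken from any corpus)
              text = [['This', 'is', 'the', 'first', 'sentence', '.'],
                      ['This', 'is', 'another', 'sentence', '.']]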
    """
    # :: Create unigram lookup ::
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        for tokenIdx in xrange(1, len(sentence)):
            word = sentence[tokenIdx]
            uniDist[word] += 1

            if word.lower() not in wordCasingLookup:
                wordCasingLookup[word.lower()] = set()

            wordCasingLookup[word.lower()].add(word)


    # :: Create backward + forward bigram lookup ::
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        for tokenIdx in xrange(2, len(sentence)):  # Start at 2 to skip the first word in the sentence
            word = sentence[tokenIdx]
            wordLower = word.lower()

            if wordLower in wordCasingLookup and len(wordCasingLookup[wordLower]) >= 2:  # Only if there are multiple options
                prevWord = sentence[tokenIdx-1]

                backwardBiDist[prevWord+"_"+word] += 1

                if tokenIdx < len(sentence)-1:
                    nextWord = sentence[tokenIdx+1].lower()
                    forwardBiDist[word+"_"+nextWord] += 1

    # :: Create trigram lookup ::
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        for tokenIdx in xrange(2, len(sentence)-1):  # Start at 2 to skip the first word in the sentence
            prevWord = sentence[tokenIdx-1]
            curWord = sentence[tokenIdx]
            curWordLower = curWord.lower()
            nextWordLower = sentence[tokenIdx+1].lower()

            if curWordLower in wordCasingLookup and len(wordCasingLookup[curWordLower]) >= 2:  # Only if there are multiple options
                trigramDist[prevWord+"_"+curWord+"_"+nextWordLower] += 1




def updateDistributionsFromNgrams(bigramFile, trigramFile, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Updates the frequency distributions based on an n-gram file, e.g.
the ngram file of http://www.ngrams.info/download_coca.asp 91 | """ 92 | for line in open(bigramFile): 93 | splits = line.strip().split('\t') 94 | cnt, word1, word2 = splits 95 | cnt = int(cnt) 96 | 97 | # Unigram 98 | if word1.lower() not in wordCasingLookup: 99 | wordCasingLookup[word1.lower()] = set() 100 | 101 | wordCasingLookup[word1.lower()].add(word1) 102 | 103 | if word2.lower() not in wordCasingLookup: 104 | wordCasingLookup[word2.lower()] = set() 105 | 106 | wordCasingLookup[word2.lower()].add(word2) 107 | 108 | 109 | uniDist[word1] += cnt 110 | uniDist[word2] += cnt 111 | 112 | # Bigrams 113 | backwardBiDist[word1+"_"+word2] +=cnt 114 | forwardBiDist[word1+"_"+word2.lower()] += cnt 115 | 116 | 117 | #Tigrams 118 | for line in open(trigramFile): 119 | splits = line.strip().split('\t') 120 | cnt, word1, word2, word3 = splits 121 | cnt = int(cnt) 122 | 123 | trigramDist[word1+"_"+word2+"_"+word3.lower()] += cnt 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /EvaluateTruecaser.py: -------------------------------------------------------------------------------- 1 | from Truecaser import * 2 | import cPickle 3 | import nltk 4 | import string 5 | 6 | 7 | def evaluateTrueCaser(testSentences, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist): 8 | correctTokens = 0 9 | totalTokens = 0 10 | 11 | for sentence in testSentences: 12 | tokensCorrect = nltk.word_tokenize(sentence) 13 | tokens = [token.lower() for token in tokensCorrect] 14 | tokensTrueCase = getTrueCase(tokens, 'title', wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist) 15 | 16 | perfectMatch = True 17 | 18 | for idx in xrange(len(tokensCorrect)): 19 | totalTokens += 1 20 | if tokensCorrect[idx] == tokensTrueCase[idx]: 21 | correctTokens += 1 22 | else: 23 | perfectMatch = False 24 | 25 | if not perfectMatch: 26 | print tokensCorrect 27 | print tokensTrueCase 28 | 29 | print "-------------------" 30 | 31 | 32 | print "Accuracy: %.2f%%" % (correctTokens / float(totalTokens)*100) 33 | 34 | 35 | def defaultTruecaserEvaluation(wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist): 36 | testSentences = [ 37 | "Its website was launched on February 4, 2004 by Mark Zuckerberg with his Harvard College roommates and fellow students Eduardo Saverin, Andrew McCollum, Dustin Moskovitz, and Chris Hughes." 38 | ,"Facebook is a for-profit corporation and online social networking service based in Menlo Park, California, United States. " 39 | ,"The founders had initially limited the website's membership to Harvard students, but later expanded it to colleges in the Boston area, the Ivy League, and Stanford University. " 40 | ,"It gradually added support for students at various other universities and later to high school students. " 41 | ,"Since 2006, anyone in general aged 13 and older has been allowed to become a registered user of the website, though variations exist in the minimum age requirement, depending on applicable local laws." 42 | ,"Its name comes from the face book directories often given to American university students." 43 | ,"Because of the large volume of data that users submit to the service, Facebook has come under scrutiny for their privacy policies. Facebook, Inc. held its initial public offering in February 2012 and began selling stock to the public three months later, reaching an original peak market capitalization of $104 billion." 
44 | ,"Zuckerberg wrote a program called Facemash on October 28, 2003 while attending Harvard University as a sophomore (second year student)." 45 | ,"Apple Inc. is an American multinational technology company headquartered in Cupertino, California, that designs, develops, and sells consumer electronics, computer software, and online services." 46 | ,"Its hardware products include the iPhone smartphone, the iPad tablet computer, the Mac personal computer, the iPod portable media player, and the Apple Watch smartwatch." 47 | ,"Apple's consumer software includes the OS X and iOS operating systems, the iTunes media player, the Safari web browser, and the iLife and iWork creativity and productivity suites." 48 | ,"Its online services include the iTunes Store, the iOS App Store and Mac App Store, and iCloud." 49 | ,"Microsoft Corporation (commonly referred to as Microsoft) is an American multinational technology company headquartered in Redmond, Washington, that develops, manufactures, licenses, supports and sells computer software, consumer electronics and personal computers and services." 50 | ,"Its best known software products are the Microsoft Windows line of operating systems, Microsoft Office office suite, and Internet Explorer and Edge web browsers." 51 | ,"Its flagship hardware products are the Xbox game consoles and the Microsoft Surface tablet lineup." 52 | ,"It is the world's largest software maker by revenue, and one of the world's most valuable companies." 53 | ,"Google is an American multinational technology company specializing in Internet-related services and products." 54 | ,"These include online advertising technologies, search, cloud computing, and software." 55 | ,"Most of its profits are derived from AdWords, an online advertising service that places advertising near the list of search results." 56 | ,"Rapid growth since incorporation has triggered a chain of products, acquisitions and partnerships beyond Google's core search engine (Google Search)." 57 | ,"It offers online productivity software (Google Docs) including email (Gmail), a cloud storage service (Google Drive) and a social networking service (Google+)." 58 | ,"Desktop products include applications for web browsing (Google Chrome), organizing and editing photos (Google Photos), and instant messaging and video chat (Hangouts)." 59 | ,"The company leads the development of the Android mobile operating system and the browser-only Chrome OS for a class of netbooks known as Chromebooks and desktop PCs known as Chromeboxes." 60 | ,"Google has moved increasingly into communications hardware, partnering with major electronics manufacturers[20] in the production of its \"high-quality low-cost\" Nexus devices." 61 | ,"In 2012, a fiber-optic infrastructure was installed in Kansas City to facilitate a Google Fiber broadband service." 62 | ,"WhatsApp Messenger is a proprietary cross-platform, encrypted, instant messaging client for smartphones." 63 | ,"It uses the Internet to send text messages, documents, images, video, user location and audio messages to other users using standard cellular mobile numbers." 64 | ,"As of February 2016, WhatsApp had a user base of one billion, making it the most popular messaging application." 65 | ,"WhatsApp Inc., based in Mountain View, California, United States, was acquired by Facebook Inc. on February 19, 2014, for approximately US$19.3 billion" 66 | ,"Barack Hussein Obama II (born August 4, 1961) is an American politician serving as the 44th President of the United States." 
67 | ,"He is the first African American to hold the office, as well as the first president born outside of the continental United States." 68 | ,"Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he served as president of the Harvard Law Review." 69 | ,"He was a community organizer in Chicago before earning his law degree." 70 | ,"He worked as a civil rights attorney and taught constitutional law at University of Chicago Law School between 1992 and 2004." 71 | ,"He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004, and ran unsuccessfully in the Democratic primary for the United States House of Representatives in 2000 against incumbent Bobby Rush." 72 | ,"In 2004, Obama received national attention during his campaign to represent Illinois in the United States Senate with his victory in the March Democratic Party primary, his keynote address at the Democratic National Convention in July, and his election to the Senate in November." 73 | ,"He began his presidential campaign in 2007 and, after a close primary campaign against Hillary Clinton in 2008, he won sufficient delegates in the Democratic Party primaries to receive the presidential nomination." 74 | ,"He then defeated Republican nominee John McCain in the general election, and was inaugurated as president on January 20, 2009." 75 | ,"Nine months after his inauguration, Obama was named the 2009 Nobel Peace Prize laureate." 76 | ,"Albert Einstein was a German-born theoretical physicist. He developed the general theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics)." 77 | ,"Einstein's work is also known for its influence on the philosophy of science." 78 | ,"Einstein is best known in popular culture for his mass-energy equivalence formula E = mc2 (which has been dubbed \"the world's most famous equation\")." 79 | ,"He received the 1921 Nobel Prize in Physics for his \"services to theoretical physics\", in particular his discovery of the law of the photoelectric effect, a pivotal step in the evolution of quantum theory." 80 | ,"Near the beginning of his career, Einstein thought that Newtonian mechanics was no longer enough to reconcile the laws of classical mechanics with the laws of the electromagnetic field." 81 | ,"This led to the development of his special theory of relativity." 82 | ,"He realized, however, that the principle of relativity could also be extended to gravitational fields, and with his subsequent theory of gravitation in 1916, he published a paper on general relativity." 83 | ,"He continued to deal with problems of statistical mechanics and quantum theory, which led to his explanations of particle theory and the motion of molecules. He also investigated the thermal properties of light which laid the foundation of the photon theory of light." 84 | ,"In 1917, Einstein applied the general theory of relativity to model the large-scale structure of the universe." 85 | ,"Ulm is a city in the federal German state of Baden-Wuerttemberg, situated on the River Danube." 86 | ,"The city, whose population is estimated at almost 120,000 (2015), forms an urban district of its own and is the administrative seat of the Alb-Donau district." 87 | ,"Ulm, founded around 850, is rich in history and traditions as a former Free Imperial City." 88 | ,"Today, it is an economic centre due to its varied industries, and it is the seat of the University of Ulm." 
89 | ,"Internationally, Ulm is primarily known for having the church with the tallest steeple in the world (161.53 m or 529.95 ft), the Gothic minster (Ulm Minster) and as the birthplace of Albert Einstein." 90 | ] 91 | 92 | evaluateTrueCaser(testSentences, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist) 93 | 94 | if __name__ == "__main__": 95 | f = open('english_distributions.obj', 'rb') 96 | uniDist = cPickle.load(f) 97 | backwardBiDist = cPickle.load(f) 98 | forwardBiDist = cPickle.load(f) 99 | trigramDist = cPickle.load(f) 100 | wordCasingLookup = cPickle.load(f) 101 | f.close() 102 | 103 | defaultTruecaserEvaluation(wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist) 104 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------