├── requirements.txt
├── ngrams
│   └── readme.txt
├── PredictTruecaser.py
├── .gitignore
├── TrainTruecaser.py
├── README.md
├── Truecaser.py
├── TrainFunctions.py
├── EvaluateTruecaser.py
└── LICENSE

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
nltk==3.3
six==1.11.0

--------------------------------------------------------------------------------
/ngrams/readme.txt:
--------------------------------------------------------------------------------
Each of the following free n-gram files contains the (approximately) 1,000,000 most frequent n-grams from the Corpus of Contemporary American English (COCA).

Source: http://www.ngrams.info/download_coca.asp

--------------------------------------------------------------------------------
/PredictTruecaser.py:
--------------------------------------------------------------------------------
from Truecaser import *
import os
import cPickle
import nltk
import string
import argparse
import fileinput


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('files', metavar='FILE', nargs='*', help='files to truecase; if empty, STDIN is used')
    parser.add_argument('-d', '--distribution_object', help='language distribution file', type=os.path.abspath, required=True)
    args = parser.parse_args()

    # Load the pickled frequency distributions produced by TrainTruecaser.py
    f = open(args.distribution_object, 'rb')
    uniDist = cPickle.load(f)
    backwardBiDist = cPickle.load(f)
    forwardBiDist = cPickle.load(f)
    trigramDist = cPickle.load(f)
    wordCasingLookup = cPickle.load(f)
    f.close()

    # Lowercase the input tokens and restore the most probable casing
    for sentence in fileinput.input(files=args.files):
        tokensCorrect = nltk.word_tokenize(sentence)
        tokens = [token.lower() for token in tokensCorrect]
        tokensTrueCase = getTrueCase(tokens, 'title', wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)
        print(" ".join(tokensTrueCase))

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

--------------------------------------------------------------------------------
/TrainTruecaser.py:
--------------------------------------------------------------------------------
"""
This script trains the TrueCase System
"""
import nltk
import nltk.corpus
from nltk.corpus import brown
from nltk.corpus import reuters
import cPickle
import string
import math
import nltk.data

from TrainFunctions import *
from EvaluateTruecaser import defaultTruecaserEvaluation


uniDist = nltk.FreqDist()
backwardBiDist = nltk.FreqDist()
forwardBiDist = nltk.FreqDist()
trigramDist = nltk.FreqDist()
wordCasingLookup = {}




"""
There are three options to train the true caser:
1) Use the sentences in NLTK
2) Use the train.txt file. Each line must contain a single sentence.
   Use a large corpus, for example Wikipedia
3) Use bigram + trigram counts from the website http://www.ngrams.info/download_coca.asp

The more training data, the better the results.
"""


# :: Option 1: Train it based on the NLTK corpus ::
print "Update from NLTK Corpus"
NLTKCorpus = brown.sents()+reuters.sents()+nltk.corpus.semcor.sents()+nltk.corpus.conll2000.sents()+nltk.corpus.state_union.sents()
updateDistributionsFromSentences(NLTKCorpus, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)

# :: Option 2: Train it based on the train.txt file ::
""" #Uncomment, if you want to train from train.txt
print "Update from train.txt file"
sentences = []
for line in open('train.txt'):
    sentences.append(line.strip())

tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
updateDistributionsFromSentences(tokens, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)
"""

# :: Option 3: Train it based on the ngram tables from http://www.ngrams.info/download_coca.asp ::
""" #Uncomment, if you want to train from the n-gram files
print "Update Bigrams / Trigrams"
updateDistributionsFromNgrams('ngrams/w2.txt', 'ngrams/w3.txt', wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)
"""

f = open('distributions.obj', 'wb')
cPickle.dump(uniDist, f, protocol=cPickle.HIGHEST_PROTOCOL)
cPickle.dump(backwardBiDist, f, protocol=cPickle.HIGHEST_PROTOCOL)
cPickle.dump(forwardBiDist, f, protocol=cPickle.HIGHEST_PROTOCOL)
cPickle.dump(trigramDist, f, protocol=cPickle.HIGHEST_PROTOCOL)
cPickle.dump(wordCasingLookup, f, protocol=cPickle.HIGHEST_PROTOCOL)
f.close()



# :: Evaluate the trained model on the built-in test sentences ::

defaultTruecaserEvaluation(wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)




--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Language Independent Truecaser for Python
This is an implementation of a trainable Truecaser for Python.

A truecaser converts a sentence where the casing was lost to the most probable casing. Use cases are sentences that are in all-upper case, in all-lower case or in title case.

A model for English is provided, achieving an accuracy of **98.39%** on a small test set of random sentences from Wikipedia.

# Model
The model was inspired by the paper of [Lucian Vlad Lita et al., tRuEcasIng](https://www.cs.cmu.edu/~llita/papers/lita.truecasing-acl2003.pdf) but with some simplifications.

The model applies a greedy strategy. For each token, from left to right, it computes the following score for every possible casing:

`score(w_0) = P(w_0) * P(w_0 | w_{-1}) * P(w_0 | w_1) * P(w_0 | w_{-1}, w_1)`

with `w_0` the word at the current position, `w_{-1}` the previous word, and `w_1` the next word in the sentence.

All observed casings for `w_0` are tested and the casing with the highest score is selected.

The probabilities `P(...)` are estimated from a large training corpus.

# Requirements
The code was written for Python 2.7 and requires NLTK (`requirements.txt` pins nltk 3.3).

From NLTK, it uses the functions to split sentences into tokens and the FreqDist() class. These parts of the code can easily be replaced, so that the code can also be used without installing NLTK; a rough sketch of possible stand-ins (illustrative only, not part of this repository) is shown below.
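
For example, a simple regex tokenizer and `collections.Counter` could stand in for `nltk.word_tokenize` and `nltk.FreqDist` (a rough sketch under that assumption; `FreqDist` subclasses `Counter`, so the `dist[key] += 1` / `dist[key]` access pattern stays the same):

```python
import re
from collections import Counter

def simple_word_tokenize(sentence):
    # Crude stand-in for nltk.word_tokenize: words, numbers, or single punctuation marks
    return re.findall(r"\w+|[^\w\s]", sentence, re.UNICODE)

# Counter offers the same counting interface the training/scoring code relies on
uniDist = Counter()
for token in simple_word_tokenize("The iPhone was presented by Apple ."):
    uniDist[token] += 1
```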

# Run the Code
You need a `distributions.obj` that contains information on the frequencies of unigrams, bigrams, and trigrams.

A pre-trained `distributions.obj` for English is provided in the [release section](https://github.com/nreimers/truecaser/releases) (name: english_distributions.obj.zip; unzip it before you can use it).

One large `distributions.obj` for English is provided in the download section of GitHub.

You can train your own `distributions.obj` using the `TrainTruecaser.py` script.

To run the model on one (or multiple) text files, pass the `distributions.obj` to the `PredictTruecaser.py` script via the `-d` option. If no text files are provided as arguments, input is read from STDIN.

To evaluate a model, have a look at `EvaluateTruecaser.py`.

# Train your own Truecaser
You can retrain the Truecaser easily. Simply replace the `train.txt` file with a large sample of sentences, change `TrainTruecaser.py` so that it uses `train.txt`, and run the script. You can also use it for languages other than English, such as German, Spanish, or French.


# Disclaimer
Sorry that this is kind of shitty code without documentation. I was looking for a truecaser for my research, but I couldn't find any working implementation. I implemented this script in a hacky manner and it works quite well (at least for me).

I think the code is simple enough that anyone can use and adapt it, and maybe it is handy for someone. The principle behind the code is really simple, but as mentioned above, it achieves good results.

Hint: The casing of company and product names is the hardest. Train the system on a large and recent dataset to achieve the best results (e.g. on a recent dump of Wikipedia).

--------------------------------------------------------------------------------
/Truecaser.py:
--------------------------------------------------------------------------------
import string
import math

"""
This file contains the functions to truecase a sentence.
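
A minimal usage sketch (illustrative only, not a doctest; the exact output
depends on the distributions trained with TrainTruecaser.py):

    tokens = ['the', 'new', 'iphone', 'was', 'presented', 'by', 'apple', '.']
    getTrueCase(tokens, 'title', wordCasingLookup, uniDist,
                backwardBiDist, forwardBiDist, trigramDist)
    # e.g. ['The', 'new', 'iPhone', 'was', 'presented', 'by', 'Apple', '.']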
"""


def getScore(prevToken, possibleToken, nextToken, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    pseudoCount = 5.0

    # Unigram score
    numerator = uniDist[possibleToken] + pseudoCount
    denominator = 0
    for alternativeToken in wordCasingLookup[possibleToken.lower()]:
        denominator += uniDist[alternativeToken] + pseudoCount

    unigramScore = numerator / denominator


    # Backward bigram score
    bigramBackwardScore = 1
    if prevToken is not None:
        numerator = backwardBiDist[prevToken+'_'+possibleToken] + pseudoCount
        denominator = 0
        for alternativeToken in wordCasingLookup[possibleToken.lower()]:
            denominator += backwardBiDist[prevToken+'_'+alternativeToken] + pseudoCount

        bigramBackwardScore = numerator / denominator

    # Forward bigram score
    bigramForwardScore = 1
    if nextToken is not None:
        nextToken = nextToken.lower()  # Ensure it is lower case
        numerator = forwardBiDist[possibleToken+"_"+nextToken] + pseudoCount
        denominator = 0
        for alternativeToken in wordCasingLookup[possibleToken.lower()]:
            denominator += forwardBiDist[alternativeToken+"_"+nextToken] + pseudoCount

        bigramForwardScore = numerator / denominator


    # Trigram score
    trigramScore = 1
    if prevToken is not None and nextToken is not None:
        nextToken = nextToken.lower()  # Ensure it is lower case
        numerator = trigramDist[prevToken+"_"+possibleToken+"_"+nextToken] + pseudoCount
        denominator = 0
        for alternativeToken in wordCasingLookup[possibleToken.lower()]:
            denominator += trigramDist[prevToken+"_"+alternativeToken+"_"+nextToken] + pseudoCount

        trigramScore = numerator / denominator

    result = math.log(unigramScore) + math.log(bigramBackwardScore) + math.log(bigramForwardScore) + math.log(trigramScore)
    #print "Scores: %f %f %f %f = %f" % (unigramScore, bigramBackwardScore, bigramForwardScore, trigramScore, math.exp(result))


    return result

def getTrueCase(tokens, outOfVocabularyTokenOption, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Returns the true case for the passed tokens.
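
    For tokens with several observed casings, getScore() is evaluated for every
    candidate casing and the highest-scoring one is kept. As a purely hypothetical
    illustration of the smoothed unigram factor (pseudoCount = 5.0): if
    uniDist['Apple'] were 90 and uniDist['apple'] were 10, the unigram score of
    'Apple' would be (90 + 5) / ((90 + 5) + (10 + 5)) = 95 / 110 ≈ 0.86.
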
    @param tokens: Tokens in a single sentence
    @param outOfVocabularyTokenOption:
        title: Returns out of vocabulary (OOV) tokens in 'title' format
        lower: Returns OOV tokens in lower case
        as-is: Returns OOV tokens as is
    """
    tokensTrueCase = []
    for tokenIdx in xrange(len(tokens)):
        token = tokens[tokenIdx]
        if token in string.punctuation or token.isdigit():
            tokensTrueCase.append(token)
        else:
            if token in wordCasingLookup:
                if len(wordCasingLookup[token]) == 1:
                    tokensTrueCase.append(list(wordCasingLookup[token])[0])
                else:
                    prevToken = tokensTrueCase[tokenIdx-1] if tokenIdx > 0 else None
                    nextToken = tokens[tokenIdx+1] if tokenIdx < len(tokens)-1 else None

                    bestToken = None
                    highestScore = float("-inf")

                    for possibleToken in wordCasingLookup[token]:
                        score = getScore(prevToken, possibleToken, nextToken, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)

                        if score > highestScore:
                            bestToken = possibleToken
                            highestScore = score

                    tokensTrueCase.append(bestToken)

                if tokenIdx == 0:
                    tokensTrueCase[0] = tokensTrueCase[0].title()

            else:  # Token out of vocabulary
                if outOfVocabularyTokenOption == 'title':
                    tokensTrueCase.append(token.title())
                elif outOfVocabularyTokenOption == 'lower':
                    tokensTrueCase.append(token.lower())
                else:
                    tokensTrueCase.append(token)

    return tokensTrueCase

--------------------------------------------------------------------------------
/TrainFunctions.py:
--------------------------------------------------------------------------------
import nltk

def getCasing(word):
    """ Returns the casing of a word"""
    if len(word) == 0:
        return 'other'
    elif word.isdigit():  # Is a digit
        return 'numeric'
    elif word.islower():  # All lower case
        return 'allLower'
    elif word.isupper():  # All upper case
        return 'allUpper'
    elif word[0].isupper():  # Is a title: initial char upper, then all lower
        return 'initialUpper'

    return 'other'


def checkSentenceSanity(sentence):
    """ Checks the sanity of the sentence. If the sentence is for example all uppercase, it is rejected"""
    caseDist = nltk.FreqDist()

    for token in sentence:
        caseDist[getCasing(token)] += 1

    if caseDist.most_common(1)[0][0] != 'allLower':
        return False

    return True

def updateDistributionsFromSentences(text, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Updates the NLTK frequency distributions based on a list of sentences.
    text: Array of sentences.
          Each sentence must be an array of tokens, for example:
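          (illustrative values, not taken from any corpus)
              text = [['This', 'is', 'the', 'first', 'sentence', '.'],
                      ['This', 'is', 'another', 'sentence', '.']]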
    """
    # :: Create unigram lookup ::
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        for tokenIdx in xrange(1, len(sentence)):
            word = sentence[tokenIdx]
            uniDist[word] += 1

            if word.lower() not in wordCasingLookup:
                wordCasingLookup[word.lower()] = set()

            wordCasingLookup[word.lower()].add(word)


    # :: Create backward + forward bigram lookup ::
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        for tokenIdx in xrange(2, len(sentence)):  # Start at 2 to skip the first word in the sentence
            word = sentence[tokenIdx]
            wordLower = word.lower()

            if wordLower in wordCasingLookup and len(wordCasingLookup[wordLower]) >= 2:  # Only if there are multiple options
                prevWord = sentence[tokenIdx-1]

                backwardBiDist[prevWord+"_"+word] += 1

                if tokenIdx < len(sentence)-1:
                    nextWord = sentence[tokenIdx+1].lower()
                    forwardBiDist[word+"_"+nextWord] += 1

    # :: Create trigram lookup ::
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        for tokenIdx in xrange(2, len(sentence)-1):  # Start at 2 to skip the first word in the sentence
            prevWord = sentence[tokenIdx-1]
            curWord = sentence[tokenIdx]
            curWordLower = curWord.lower()
            nextWordLower = sentence[tokenIdx+1].lower()

            if curWordLower in wordCasingLookup and len(wordCasingLookup[curWordLower]) >= 2:  # Only if there are multiple options
                trigramDist[prevWord+"_"+curWord+"_"+nextWordLower] += 1




def updateDistributionsFromNgrams(bigramFile, trigramFile, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Updates the frequency distributions based on an n-gram file, e.g.
the ngram file of http://www.ngrams.info/download_coca.asp 91 | """ 92 | for line in open(bigramFile): 93 | splits = line.strip().split('\t') 94 | cnt, word1, word2 = splits 95 | cnt = int(cnt) 96 | 97 | # Unigram 98 | if word1.lower() not in wordCasingLookup: 99 | wordCasingLookup[word1.lower()] = set() 100 | 101 | wordCasingLookup[word1.lower()].add(word1) 102 | 103 | if word2.lower() not in wordCasingLookup: 104 | wordCasingLookup[word2.lower()] = set() 105 | 106 | wordCasingLookup[word2.lower()].add(word2) 107 | 108 | 109 | uniDist[word1] += cnt 110 | uniDist[word2] += cnt 111 | 112 | # Bigrams 113 | backwardBiDist[word1+"_"+word2] +=cnt 114 | forwardBiDist[word1+"_"+word2.lower()] += cnt 115 | 116 | 117 | #Tigrams 118 | for line in open(trigramFile): 119 | splits = line.strip().split('\t') 120 | cnt, word1, word2, word3 = splits 121 | cnt = int(cnt) 122 | 123 | trigramDist[word1+"_"+word2+"_"+word3.lower()] += cnt 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /EvaluateTruecaser.py: -------------------------------------------------------------------------------- 1 | from Truecaser import * 2 | import cPickle 3 | import nltk 4 | import string 5 | 6 | 7 | def evaluateTrueCaser(testSentences, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist): 8 | correctTokens = 0 9 | totalTokens = 0 10 | 11 | for sentence in testSentences: 12 | tokensCorrect = nltk.word_tokenize(sentence) 13 | tokens = [token.lower() for token in tokensCorrect] 14 | tokensTrueCase = getTrueCase(tokens, 'title', wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist) 15 | 16 | perfectMatch = True 17 | 18 | for idx in xrange(len(tokensCorrect)): 19 | totalTokens += 1 20 | if tokensCorrect[idx] == tokensTrueCase[idx]: 21 | correctTokens += 1 22 | else: 23 | perfectMatch = False 24 | 25 | if not perfectMatch: 26 | print tokensCorrect 27 | print tokensTrueCase 28 | 29 | print "-------------------" 30 | 31 | 32 | print "Accuracy: %.2f%%" % (correctTokens / float(totalTokens)*100) 33 | 34 | 35 | def defaultTruecaserEvaluation(wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist): 36 | testSentences = [ 37 | "Its website was launched on February 4, 2004 by Mark Zuckerberg with his Harvard College roommates and fellow students Eduardo Saverin, Andrew McCollum, Dustin Moskovitz, and Chris Hughes." 38 | ,"Facebook is a for-profit corporation and online social networking service based in Menlo Park, California, United States. " 39 | ,"The founders had initially limited the website's membership to Harvard students, but later expanded it to colleges in the Boston area, the Ivy League, and Stanford University. " 40 | ,"It gradually added support for students at various other universities and later to high school students. " 41 | ,"Since 2006, anyone in general aged 13 and older has been allowed to become a registered user of the website, though variations exist in the minimum age requirement, depending on applicable local laws." 42 | ,"Its name comes from the face book directories often given to American university students." 43 | ,"Because of the large volume of data that users submit to the service, Facebook has come under scrutiny for their privacy policies. Facebook, Inc. held its initial public offering in February 2012 and began selling stock to the public three months later, reaching an original peak market capitalization of $104 billion." 
44 | ,"Zuckerberg wrote a program called Facemash on October 28, 2003 while attending Harvard University as a sophomore (second year student)." 45 | ,"Apple Inc. is an American multinational technology company headquartered in Cupertino, California, that designs, develops, and sells consumer electronics, computer software, and online services." 46 | ,"Its hardware products include the iPhone smartphone, the iPad tablet computer, the Mac personal computer, the iPod portable media player, and the Apple Watch smartwatch." 47 | ,"Apple's consumer software includes the OS X and iOS operating systems, the iTunes media player, the Safari web browser, and the iLife and iWork creativity and productivity suites." 48 | ,"Its online services include the iTunes Store, the iOS App Store and Mac App Store, and iCloud." 49 | ,"Microsoft Corporation (commonly referred to as Microsoft) is an American multinational technology company headquartered in Redmond, Washington, that develops, manufactures, licenses, supports and sells computer software, consumer electronics and personal computers and services." 50 | ,"Its best known software products are the Microsoft Windows line of operating systems, Microsoft Office office suite, and Internet Explorer and Edge web browsers." 51 | ,"Its flagship hardware products are the Xbox game consoles and the Microsoft Surface tablet lineup." 52 | ,"It is the world's largest software maker by revenue, and one of the world's most valuable companies." 53 | ,"Google is an American multinational technology company specializing in Internet-related services and products." 54 | ,"These include online advertising technologies, search, cloud computing, and software." 55 | ,"Most of its profits are derived from AdWords, an online advertising service that places advertising near the list of search results." 56 | ,"Rapid growth since incorporation has triggered a chain of products, acquisitions and partnerships beyond Google's core search engine (Google Search)." 57 | ,"It offers online productivity software (Google Docs) including email (Gmail), a cloud storage service (Google Drive) and a social networking service (Google+)." 58 | ,"Desktop products include applications for web browsing (Google Chrome), organizing and editing photos (Google Photos), and instant messaging and video chat (Hangouts)." 59 | ,"The company leads the development of the Android mobile operating system and the browser-only Chrome OS for a class of netbooks known as Chromebooks and desktop PCs known as Chromeboxes." 60 | ,"Google has moved increasingly into communications hardware, partnering with major electronics manufacturers[20] in the production of its \"high-quality low-cost\" Nexus devices." 61 | ,"In 2012, a fiber-optic infrastructure was installed in Kansas City to facilitate a Google Fiber broadband service." 62 | ,"WhatsApp Messenger is a proprietary cross-platform, encrypted, instant messaging client for smartphones." 63 | ,"It uses the Internet to send text messages, documents, images, video, user location and audio messages to other users using standard cellular mobile numbers." 64 | ,"As of February 2016, WhatsApp had a user base of one billion, making it the most popular messaging application." 65 | ,"WhatsApp Inc., based in Mountain View, California, United States, was acquired by Facebook Inc. on February 19, 2014, for approximately US$19.3 billion" 66 | ,"Barack Hussein Obama II (born August 4, 1961) is an American politician serving as the 44th President of the United States." 
67 | ,"He is the first African American to hold the office, as well as the first president born outside of the continental United States." 68 | ,"Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he served as president of the Harvard Law Review." 69 | ,"He was a community organizer in Chicago before earning his law degree." 70 | ,"He worked as a civil rights attorney and taught constitutional law at University of Chicago Law School between 1992 and 2004." 71 | ,"He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004, and ran unsuccessfully in the Democratic primary for the United States House of Representatives in 2000 against incumbent Bobby Rush." 72 | ,"In 2004, Obama received national attention during his campaign to represent Illinois in the United States Senate with his victory in the March Democratic Party primary, his keynote address at the Democratic National Convention in July, and his election to the Senate in November." 73 | ,"He began his presidential campaign in 2007 and, after a close primary campaign against Hillary Clinton in 2008, he won sufficient delegates in the Democratic Party primaries to receive the presidential nomination." 74 | ,"He then defeated Republican nominee John McCain in the general election, and was inaugurated as president on January 20, 2009." 75 | ,"Nine months after his inauguration, Obama was named the 2009 Nobel Peace Prize laureate." 76 | ,"Albert Einstein was a German-born theoretical physicist. He developed the general theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics)." 77 | ,"Einstein's work is also known for its influence on the philosophy of science." 78 | ,"Einstein is best known in popular culture for his mass-energy equivalence formula E = mc2 (which has been dubbed \"the world's most famous equation\")." 79 | ,"He received the 1921 Nobel Prize in Physics for his \"services to theoretical physics\", in particular his discovery of the law of the photoelectric effect, a pivotal step in the evolution of quantum theory." 80 | ,"Near the beginning of his career, Einstein thought that Newtonian mechanics was no longer enough to reconcile the laws of classical mechanics with the laws of the electromagnetic field." 81 | ,"This led to the development of his special theory of relativity." 82 | ,"He realized, however, that the principle of relativity could also be extended to gravitational fields, and with his subsequent theory of gravitation in 1916, he published a paper on general relativity." 83 | ,"He continued to deal with problems of statistical mechanics and quantum theory, which led to his explanations of particle theory and the motion of molecules. He also investigated the thermal properties of light which laid the foundation of the photon theory of light." 84 | ,"In 1917, Einstein applied the general theory of relativity to model the large-scale structure of the universe." 85 | ,"Ulm is a city in the federal German state of Baden-Wuerttemberg, situated on the River Danube." 86 | ,"The city, whose population is estimated at almost 120,000 (2015), forms an urban district of its own and is the administrative seat of the Alb-Donau district." 87 | ,"Ulm, founded around 850, is rich in history and traditions as a former Free Imperial City." 88 | ,"Today, it is an economic centre due to its varied industries, and it is the seat of the University of Ulm." 
89 | ,"Internationally, Ulm is primarily known for having the church with the tallest steeple in the world (161.53 m or 529.95 ft), the Gothic minster (Ulm Minster) and as the birthplace of Albert Einstein." 90 | ] 91 | 92 | evaluateTrueCaser(testSentences, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist) 93 | 94 | if __name__ == "__main__": 95 | f = open('english_distributions.obj', 'rb') 96 | uniDist = cPickle.load(f) 97 | backwardBiDist = cPickle.load(f) 98 | forwardBiDist = cPickle.load(f) 99 | trigramDist = cPickle.load(f) 100 | wordCasingLookup = cPickle.load(f) 101 | f.close() 102 | 103 | defaultTruecaserEvaluation(wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist) 104 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------