├── .gitignore ├── circle.yml ├── crf ├── testdata │ ├── train_data_3.txt │ ├── train_data_4.txt │ ├── train_data_1.txt │ └── train_data_2.txt ├── crf_trainer_test.go ├── crf_trainer.go ├── crf_test.go └── crf.go ├── .vscode └── launch.json ├── README.md ├── bin └── ci_test.sh ├── LICENSE └── labelling ├── labelling.go └── labelling_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | *.test -------------------------------------------------------------------------------- /circle.yml: -------------------------------------------------------------------------------- 1 | test: 2 | override: 3 | - sh ./bin/ci_test.sh -------------------------------------------------------------------------------- /crf/testdata/train_data_3.txt: -------------------------------------------------------------------------------- 1 | label1 label2 label3 2 | 3 | 1 cup water 4 | label1 label2 5 | -------------------------------------------------------------------------------- /crf/testdata/train_data_4.txt: -------------------------------------------------------------------------------- 1 | label1 label2 label3 2 | 3 | 1 cup water 4 | label1 label2 label5 5 | -------------------------------------------------------------------------------- /crf/testdata/train_data_1.txt: -------------------------------------------------------------------------------- 1 | quantity unit name comment 2 | 3 | 1/4 cup milk 4 | quantity unit name 5 | 6 | 3 large eggs 7 | quantity name name 8 | 9 | 5 peeled carrots 10 | quantity comment name 11 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Debug Tests", 6 | "type": "go", 7 | "request": "launch", 8 | "mode": "test", 9 | "remotePath": "", 10 | "port": 2345, 11 | "host": "127.0.0.1", 12 | "program": 
"${workspaceRoot}", 13 | "env": {}, 14 | "args": [], 15 | "showLog": true 16 | } 17 | ] 18 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP 2 | 3 | [![CircleCI](https://circleci.com/gh/chriscasola/nlp.svg?style=svg)](https://circleci.com/gh/chriscasola/nlp) 4 | [![Go Report Card](https://goreportcard.com/badge/github.com/chriscasola/nlp)](https://goreportcard.com/report/github.com/chriscasola/nlp) 5 | [![GoDoc](https://godoc.org/github.com/chriscasola/nlp?status.svg)](https://godoc.org/github.com/chriscasola/nlp) 6 | 7 | NLP is a go package meant to contain implementations of common natural language processing algorithms. So far there 8 | is a naive implementation of conditional random fields. 9 | 10 | ## CRF 11 | 12 | The CRF implementation draws from the following articles: 13 | 14 | * [Introduction to Conditional Random Fields](http://blog.echen.me/2012/01/03/introduction-to-conditional-random-fields/) 15 | * [An Introduction to Conditional Random Fields](http://homepages.inf.ed.ac.uk/csutton/publications/crftutv2.pdf) 16 | * [An Introduction to Conditional Random Fields for Relational Learning](https://people.cs.umass.edu/~mccallum/papers/crf-tutorial.pdf) 17 | -------------------------------------------------------------------------------- /bin/ci_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | go vet ./... 4 | go get -u github.com/jstemmer/go-junit-report 5 | mkdir -p $CIRCLE_TEST_REPORTS/junit 6 | mkdir -p $CIRCLE_ARTIFACTS/coverage 7 | cd .. 8 | mkdir -p .go_workspace/src/github.com/chriscasola/nlp 9 | cp -R nlp .go_workspace/src/github.com/chriscasola/ 10 | cd .go_workspace/src/github.com/chriscasola/nlp 11 | go test -v ./... 
| go-junit-report > $CIRCLE_TEST_REPORTS/junit/report.xml 12 | cd crf 13 | go test -covermode=count -coverprofile=$CIRCLE_ARTIFACTS/coverage_crf.out ./... 14 | go tool cover -html=$CIRCLE_ARTIFACTS/coverage_crf.out -o $CIRCLE_ARTIFACTS/coverage_crf.html 15 | go tool cover -func=$CIRCLE_ARTIFACTS/coverage_crf.out -o $CIRCLE_ARTIFACTS/coverage_crf.txt 16 | cd ../labelling 17 | go test -covermode=count -coverprofile=$CIRCLE_ARTIFACTS/coverage_labelling.out ./... 18 | go tool cover -html=$CIRCLE_ARTIFACTS/coverage_labelling.out -o $CIRCLE_ARTIFACTS/coverage_labelling.html 19 | go tool cover -func=$CIRCLE_ARTIFACTS/coverage_labelling.out -o $CIRCLE_ARTIFACTS/coverage_labelling.txt 20 | cd .. 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Chris Casola 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
// FeatureFunction is a feature function for the model. It scores how well
// labelCurr fits the word at index i of sentence, given labelPrev, the label
// already chosen for the previous word ("" when i is 0).
type FeatureFunction func(sentence []string, i int, labelCurr string, labelPrev string) float64

// Feature includes the weight and feature function for a feature. The
// contribution of a feature to a label's score is Weight * Value(...).
type Feature struct {
	Weight float64
	Value  FeatureFunction
}

// FindBestLabelling determines the best labelling for the given sentence.
//
// The sentence is labelled greedily from left to right: for each word every
// candidate label is scored as the weighted sum of all features, and the
// highest-scoring label is kept and fed to the next word as its previous
// label. Ties are resolved in favour of the label that appears first in
// labels.
//
// The returned slice has one entry per word. When labels is empty every entry
// is the empty string.
func FindBestLabelling(sentence []string, labels []string, features []Feature) []string {
	labelling := make([]string, 0, len(sentence))

	for i := range sentence {
		prevLabel := ""
		if i > 0 {
			prevLabel = labelling[i-1]
		}

		bestLabel, bestScore := "", 0.0
		for j, label := range labels {
			score := 0.0
			for _, feature := range features {
				score += feature.Weight * feature.Value(sentence, i, label, prevLabel)
			}

			// The first label always seeds the running best. Starting from a
			// fixed sentinel such as -1 would reject every label as soon as
			// all scores are at or below that sentinel (negative weights).
			if j == 0 || score > bestScore {
				bestLabel, bestScore = label, score
			}
		}

		labelling = append(labelling, bestLabel)
	}

	return labelling
}
i++ { 11 | if strings.ToLower(list[i]) == strings.ToLower(s) { 12 | return true 13 | } 14 | } 15 | 16 | return false 17 | } 18 | 19 | func isQuantityAtBeginning(sentence []string, i int, labelCurr string, labelPrev string) float64 { 20 | if i == 0 && stringInArray([]string{"1", "2", "3", "4", "5", "6", "7", "8", "9"}, sentence[i]) && labelCurr == "quantity" { 21 | return 1 22 | } 23 | 24 | return 0 25 | } 26 | 27 | func unitFollowsQuantity(sentence []string, i int, labelCurr string, labelPrev string) float64 { 28 | if labelPrev == "quantity" && labelCurr == "units" { 29 | return 1 30 | } 31 | 32 | return 0 33 | } 34 | 35 | func ingredientFollowsUnit(sentence []string, i int, labelCurr string, labelPrev string) float64 { 36 | if labelPrev == "units" && labelCurr == "ingredient" { 37 | return 1 38 | } 39 | 40 | return 0 41 | } 42 | 43 | func TestFindBestLabelling(t *testing.T) { 44 | sentence := strings.Split("1 cup apples", " ") 45 | labels := []string{"quantity", "units", "ingredient"} 46 | features := []Feature{ 47 | {1.0, isQuantityAtBeginning}, 48 | {1.0, unitFollowsQuantity}, 49 | {1.0, ingredientFollowsUnit}, 50 | } 51 | bestLabelling := FindBestLabelling(sentence, labels, features) 52 | 53 | expectedLabelling := []string{"quantity", "units", "ingredient"} 54 | 55 | if reflect.DeepEqual(bestLabelling, expectedLabelling) != true { 56 | t.Errorf("Expected %v to equal %v", bestLabelling, expectedLabelling) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /crf/crf_trainer_test.go: -------------------------------------------------------------------------------- 1 | package crf 2 | 3 | import "testing" 4 | import "reflect" 5 | 6 | func TestLoadTrainingData(t *testing.T) { 7 | labels, sentences, err := LoadTrainingData("./testdata/train_data_1.txt") 8 | 9 | if err != nil { 10 | t.Errorf("Unexpected error: %v", err) 11 | return 12 | } 13 | 14 | expectedLabels := []Label{"quantity", "unit", "name", "comment"} 15 | 16 | 
expectedSentences := []Sentence{ 17 | {Words: []string{"1/4", "cup", "milk"}, Labeling: SentenceLabeling{Labels: []Label{"quantity", "unit", "name"}}}, 18 | {Words: []string{"3", "large", "eggs"}, Labeling: SentenceLabeling{Labels: []Label{"quantity", "name", "name"}}}, 19 | {Words: []string{"5", "peeled", "carrots"}, Labeling: SentenceLabeling{Labels: []Label{"quantity", "comment", "name"}}}, 20 | } 21 | 22 | if reflect.DeepEqual(labels, expectedLabels) != true { 23 | t.Errorf("Expected %v to equal %v", labels, expectedLabels) 24 | } 25 | 26 | if reflect.DeepEqual(sentences, expectedSentences) != true { 27 | t.Errorf("Expected %v to equal %v", sentences, expectedSentences) 28 | } 29 | } 30 | 31 | func TestLoadTrainingDataWithErrors(t *testing.T) { 32 | cases := map[string]string{ 33 | "./testdata/train_data_3.txt": "not enough labels (line 4)", 34 | "./testdata/train_data_4.txt": "invalid label (line 4)", 35 | } 36 | 37 | for filePath, expectedError := range cases { 38 | _, _, err := LoadTrainingData(filePath) 39 | 40 | if err == nil { 41 | t.Errorf("Expected error for bad training data") 42 | return 43 | } 44 | 45 | if err.Error() != expectedError { 46 | t.Errorf("Expected \"%v\" to be \"%v\"", err.Error(), expectedError) 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /crf/crf_trainer.go: -------------------------------------------------------------------------------- 1 | package crf 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "os" 7 | "strings" 8 | ) 9 | 10 | // LoadTrainingData loads training data from a file. The first line of the file should 11 | // be a space separate list of all possible labels. The following lines should 12 | // alternate between a training sentence and the labeling for that sentence. Blank 13 | // lines will be ignored. 
Below is an example: 14 | // 15 | // 1/4 cup milk 16 | // quantity unit ingredient 17 | // 18 | // 3 large eggs 19 | // quantity ingredient ingredient 20 | // 21 | func LoadTrainingData(filename string) ([]Label, []Sentence, error) { 22 | file, err := os.Open(filename) 23 | 24 | if err != nil { 25 | return nil, nil, fmt.Errorf("Unable to open data file: %v", err) 26 | } 27 | 28 | defer file.Close() 29 | 30 | lineNum := 0 31 | result := make([]Sentence, 0) 32 | labelSet := make([]Label, 0) 33 | scanner := bufio.NewScanner(file) 34 | var currentSentence *Sentence 35 | 36 | if scanner.Scan() { 37 | lineNum++ 38 | labels := strings.Split(scanner.Text(), " ") 39 | for _, label := range labels { 40 | if label != "" { 41 | labelSet = append(labelSet, Label(label)) 42 | } 43 | } 44 | } 45 | 46 | for scanner.Scan() { 47 | lineNum++ 48 | line := scanner.Text() 49 | if line == "" { 50 | continue 51 | } 52 | 53 | if currentSentence == nil { 54 | currentSentence = MakeSentence(line) 55 | } else { 56 | sentenceLabels := removeEmptyString(strings.Split(line, " ")) 57 | if len(sentenceLabels) != len(currentSentence.Words) { 58 | return nil, nil, fmt.Errorf("not enough labels (line %v)", lineNum) 59 | } 60 | for _, label := range sentenceLabels { 61 | if !labelExists(labelSet, label) { 62 | return nil, nil, fmt.Errorf("invalid label (line %v)", lineNum) 63 | } 64 | currentSentence.Labeling.Labels = append(currentSentence.Labeling.Labels, Label(label)) 65 | } 66 | result = append(result, *currentSentence) 67 | currentSentence = nil 68 | } 69 | } 70 | 71 | if err := scanner.Err(); err != nil { 72 | return nil, nil, fmt.Errorf("Error reading training file: %v", err) 73 | } 74 | 75 | return labelSet, result, nil 76 | } 77 | 78 | func labelExists(labels []Label, label string) bool { 79 | for i := 0; i < len(labels); i++ { 80 | if string(labels[i]) == label { 81 | return true 82 | } 83 | } 84 | return false 85 | } 86 | 
-------------------------------------------------------------------------------- /crf/crf_test.go: -------------------------------------------------------------------------------- 1 | package crf 2 | 3 | import ( 4 | "math" 5 | "math/rand" 6 | "reflect" 7 | "strings" 8 | "testing" 9 | ) 10 | 11 | func featureFuncA(s []string, i int, labelCurr Label, labelPrev Label) bool { 12 | if i%2 == 0 { 13 | return true 14 | } 15 | 16 | return false 17 | } 18 | 19 | func featureFuncB(s []string, i int, labelCurr Label, labelPrev Label) bool { 20 | if strings.ToLower(s[i])[0] == 't' { 21 | return true 22 | } 23 | 24 | return false 25 | } 26 | 27 | func featureFuncC(s []string, i int, labelCurr Label, labelPrev Label) bool { 28 | if labelCurr != labelPrev && labelPrev != "" { 29 | return true 30 | } 31 | 32 | return false 33 | } 34 | 35 | func featureFuncD(s []string, i int, labelCurr Label, labelPrev Label) bool { 36 | if labelCurr == "B" && labelPrev == "A" { 37 | return true 38 | } 39 | 40 | return false 41 | } 42 | 43 | var featureA = Feature{Weight: 0.25, Value: featureFuncA} 44 | var featureB = Feature{Weight: 0.75, Value: featureFuncB} 45 | var featureC = Feature{Weight: 0.75, Value: featureFuncC} 46 | var featureD = Feature{Weight: 0.25, Value: featureFuncD} 47 | 48 | func TestScoreLabeling(t *testing.T) { 49 | sentenceAFeatures := make([]Feature, 0) 50 | sentenceAFeatures = append(sentenceAFeatures, featureA) 51 | sentenceAFeatures = append(sentenceAFeatures, featureB) 52 | 53 | sentenceA := MakeSentence("This is a test sentence") 54 | labelingA1 := SentenceLabeling{Labels: []Label{"A", "B", "A", "B", "A"}} 55 | 56 | if score := sentenceA.ScoreLabeling(&labelingA1, sentenceAFeatures); score != math.Exp(2.25) { 57 | t.Errorf("Score is %v but should be %v", score, math.Exp(2.25)) 58 | } 59 | } 60 | 61 | func TestGetAllPossibleLabelings(t *testing.T) { 62 | result := getAllPossibleLabelings([]string{"the", "fat", "cat"}, []Label{"a", "b"}) 63 | expected := 
[]SentenceLabeling{ 64 | {Labels: []Label{"a", "a", "a"}}, 65 | {Labels: []Label{"a", "a", "b"}}, 66 | {Labels: []Label{"a", "b", "a"}}, 67 | {Labels: []Label{"a", "b", "b"}}, 68 | {Labels: []Label{"b", "a", "a"}}, 69 | {Labels: []Label{"b", "a", "b"}}, 70 | {Labels: []Label{"b", "b", "a"}}, 71 | {Labels: []Label{"b", "b", "b"}}, 72 | } 73 | 74 | if reflect.DeepEqual(result, expected) != true { 75 | t.Errorf("Expected %v to be %v", result, expected) 76 | } 77 | 78 | result = getAllPossibleLabelings([]string{"the", "fat"}, []Label{"a", "b", "c"}) 79 | expected = []SentenceLabeling{ 80 | {Labels: []Label{"a", "a"}}, 81 | {Labels: []Label{"a", "b"}}, 82 | {Labels: []Label{"a", "c"}}, 83 | {Labels: []Label{"b", "a"}}, 84 | {Labels: []Label{"b", "b"}}, 85 | {Labels: []Label{"b", "c"}}, 86 | {Labels: []Label{"c", "a"}}, 87 | {Labels: []Label{"c", "b"}}, 88 | {Labels: []Label{"c", "c"}}, 89 | } 90 | 91 | if reflect.DeepEqual(result, expected) != true { 92 | t.Errorf("Expected %v to be %v", result, expected) 93 | } 94 | } 95 | 96 | func TestCalculateBestLabeling(t *testing.T) { 97 | sentenceAFeatures := make([]Feature, 0) 98 | sentenceAFeatures = append(sentenceAFeatures, featureC) 99 | sentenceAFeatures = append(sentenceAFeatures, featureD) 100 | 101 | sentenceA := MakeSentence("This is a test sentence") 102 | sentenceA.CalculateBestLabeling(sentenceAFeatures, []Label{"A", "B"}) 103 | 104 | expected := []Label{"A", "B", "A", "B", "A"} 105 | result := sentenceA.Labeling.Labels 106 | if reflect.DeepEqual(result, expected) != true { 107 | t.Errorf("Expected %v to be %v", result, expected) 108 | } 109 | } 110 | 111 | func TestGetRandomWeights(t *testing.T) { 112 | rand.Seed(1) 113 | randWeights := getRandomWeights(5) 114 | expected := []float64{0.19682432385076745, 0.3061472185456322, 0.21632243051716366, 0.14248132487885407, 0.1382247022075827} 115 | if reflect.DeepEqual(randWeights, expected) != true { 116 | t.Errorf("Expected %v to be %v", randWeights, expected) 117 | } 
118 | 119 | sum := float64(0) 120 | for _, num := range randWeights { 121 | sum += num 122 | } 123 | 124 | if sum != 1 { 125 | t.Errorf("Expected %v to be 1", sum) 126 | } 127 | } 128 | 129 | func QuantityAtBeginning(s []string, i int, labelCurr Label, labelPrev Label) bool { 130 | if i == 0 && labelCurr == "quantity" { 131 | return true 132 | } 133 | 134 | return false 135 | } 136 | 137 | func UnitAfterQuantity(s []string, i int, labelCurr Label, labelPrev Label) bool { 138 | if labelPrev == "quantity" && labelCurr == "unit" { 139 | return true 140 | } 141 | 142 | return false 143 | } 144 | 145 | func TestLearnWeights(t *testing.T) { 146 | t.Skip("Skipping until performance can be improved") 147 | features := []Feature{ 148 | Feature{Value: QuantityAtBeginning}, 149 | Feature{Value: UnitAfterQuantity}, 150 | } 151 | labels, trainingData, err := LoadTrainingData("./testdata/train_data_2.txt") 152 | 153 | if err != nil { 154 | t.Errorf("Unexpected error reading test data file: %v", err) 155 | return 156 | } 157 | 158 | LearnWeights(features, labels, trainingData) 159 | } 160 | -------------------------------------------------------------------------------- /crf/testdata/train_data_2.txt: -------------------------------------------------------------------------------- 1 | quantity unit name comment separator other 2 | 3 | 1/4 cup milk 4 | quantity unit name 5 | 6 | 3 large eggs 7 | quantity name name 8 | 9 | 5 peeled carrots 10 | quantity comment name 11 | 12 | 2 tablespoon soy sauce 13 | quantity unit name name 14 | 15 | 1 tablespoon dry sherry 16 | quantity unit name name 17 | 18 | 1 dash sesame oil 19 | quantity unit name name 20 | 21 | 2 tablespoon all-purpose flour 22 | quantity unit name name 23 | 24 | 2 tablespoon cornstarch 25 | quantity unit name 26 | 27 | 1/4 teaspoon baking powder 28 | quantity unit name name 29 | 30 | 1/4 teaspoon baking soda 31 | quantity unit name name 32 | 33 | 1 teaspoon canola oil 34 | quantity unit name name 35 | 36 | 4 (5 ounce) 
skinless, boneless chicken breast halves, cut into 1-inch cubes 37 | quantity unit unit name name name name name comment comment comment comment 38 | 39 | 1 quart vegetable oil for frying 40 | quantity unit name name comment comment 41 | 42 | 1/2 cup water 43 | quantity unit name 44 | 45 | 1 cup chicken broth 46 | quantity unit name name 47 | 48 | 1/4 cup distilled white vinegar 49 | quantity unit name name name 50 | 51 | 1/4 cup cornstarch 52 | quantity unit name 53 | 54 | 1 teaspoon red chile paste (such as Thai Kitchen) 55 | quantity unit name name name comment comment comment comment 56 | 57 | 1 clove garlic, minced 58 | quantity name name comment 59 | 60 | 2 tablespoon toasted sesame seeds 61 | quantity unit comment name name 62 | 63 | 3 zucchinis - ends trimmed, halved, and cut into 1/2-inch strips 64 | quantity name separator comment comment comment comment comment comment comment comment 65 | 66 | 2 eggs, beaten 67 | quantity name comment 68 | 69 | cooking spray 70 | name name 71 | 72 | 1/4 cup dry milk powder 73 | quantity unit name name name 74 | 75 | 3 tablespoon white sugar 76 | quantity unit name name 77 | 78 | 1 cup warm water (110 degrees F/45 degrees C) 79 | quantity unit comment name comment comment comment comment comment 80 | 81 | 2 tablespoon butter, softened 82 | quantity unit name comment 83 | 84 | 1 (.25 ounce) package active dry yeast 85 | quantity unit unit unit name name name 86 | 87 | 1 egg white 88 | quantity name name 89 | 90 | 1/4 cup all-purpose flour for coating 91 | quantity unit name name comment comment 92 | 93 | 1/4 teaspoon ground black pepper 94 | quantity unit name name name 95 | 96 | 1 tablespoon finely chopped Chinese chives 97 | quantity unit comment comment name name 98 | 99 | 1 teaspoon chile-garlic sauce (such as Sriracha) 100 | quantity unit name name comment comment comment 101 | 102 | 3 clove garlic, minced 103 | quantity unit name comment 104 | 105 | 1 egg, beaten 106 | quantity name comment 107 | 108 | 1 tablespoon 
minced fresh ginger 109 | quantity unit comment other name 110 | 111 | 50 dumpling wrappers 112 | quantity name name 113 | 114 | 1 cup vegetable oil for frying 115 | quantity unit name name comment comment 116 | 117 | 1 quart water, or more as needed 118 | quantity unit name comment comment comment comment 119 | 120 | 1 fresh jalapeno pepper, finely diced 121 | quantity other name name comment comment 122 | 123 | 1 bunch asparagus, trimmed and cut into 1/4-inch pieces 124 | quantity unit name comment comment comment comment comment comment 125 | 126 | 1 cup shelled fresh or thawed frozen peas 127 | quantity unit comment other other other name name 128 | 129 | 2 tablespoon chopped fresh mint 130 | quantity unit comment other name 131 | 132 | salt and freshly ground black pepper to taste 133 | other other other other other other other other 134 | 135 | 2 (4 ounce) filet mignon steaks 136 | quantity unit unit name name name 137 | 138 | 1/2 teaspoon freshly ground black pepper to taste 139 | quantity unit comment comment name name comment comment 140 | 141 | salt to taste 142 | name comment comment 143 | 144 | 1/4 cup dry red wine 145 | quantity unit name name name 146 | 147 | 6 cup chicken broth, divided 148 | quantity unit name name comment 149 | 150 | 3 tablespoon olive oil, divided 151 | quantity unit name name comment 152 | 153 | 1 pound portobello mushrooms, thinly sliced 154 | quantity unit name name comment comment 155 | 156 | 1 pound white mushrooms, thinly sliced 157 | quantity unit name name comment comment 158 | 159 | 2 shallots, diced 160 | quantity name comment 161 | 162 | 1 1/2 cup Arborio rice 163 | quantity quantity unit name name 164 | 165 | sea salt to taste 166 | name name comment comment 167 | 168 | freshly ground black pepper to taste 169 | comment comment name name comment comment 170 | 171 | 3 tablespoon finely chopped chives 172 | quantity unit comment comment name 173 | 174 | 4 tablespoon butter 175 | quantity unit name 176 | 177 | 1/3 cup 
// Label is a label applied to a word in a sentence.
type Label string

// FeatureFunction is a feature function for linear-chain CRF. It reports
// whether the feature fires for the word at index i of s when that word is
// labelled labelCurr and the previous word is labelled labelPrev ("" for the
// first word).
type FeatureFunction func(s []string, i int, labelCurr Label, labelPrev Label) bool

// Feature includes the weight and feature function for a CRF feature.
type Feature struct {
	Weight float64
	Value  FeatureFunction
}

// EvaluateFeature evaluates the score of a given labeling using the feature
// function: the feature's weight is added once for every position at which
// the feature function fires.
//
// It panics when the number of labels differs from the number of words, since
// that indicates a programming error in the caller.
func (f *Feature) EvaluateFeature(s []string, labeling *SentenceLabeling) float64 {
	if len(s) != len(labeling.Labels) {
		panic(fmt.Sprintf("Misaligned labels for \"%v\" labeled with \"%v\"\n", s, labeling.Labels))
	}

	score := float64(0)
	prev := Label("") // the first word has no predecessor
	for i, curr := range labeling.Labels {
		if f.Value(s, i, curr, prev) {
			score += f.Weight
		}
		prev = curr
	}

	return score
}

// SentenceLabeling is a specific order of labels for a sentence, together
// with the score and probability assigned to it by the model.
type SentenceLabeling struct {
	Labels      []Label
	Score       float64
	Probability float64
}

// Sentence is a sentence to be processed using CRF.
type Sentence struct {
	Words    []string
	Labeling SentenceLabeling
}

// MakeSentence makes a new, unlabelled Sentence from the given text. Words
// are separated by single spaces; runs of spaces produce no empty words.
func MakeSentence(sentence string) *Sentence {
	return &Sentence{Words: removeEmptyString(strings.Split(sentence, " "))}
}

// ScoreLabeling determines the score of a given labeling of the sentence: the
// exponential of the summed feature scores.
func (s *Sentence) ScoreLabeling(labeling *SentenceLabeling, features []Feature) float64 {
	score := float64(0)

	for i := range features {
		score += features[i].EvaluateFeature(s.Words, labeling)
	}

	return math.Exp(score)
}

// recursivelyLabelWord extends the label prefix appliedLabels with every
// label in allLabels until it covers all words, and returns the completed
// labelings in enumeration order.
//
// Every extension is written into a freshly allocated slice. Appending to the
// shared prefix instead would let sibling labelings overwrite each other's
// last label whenever the prefix has spare capacity.
func recursivelyLabelWord(words []string, allLabels []Label, appliedLabels []Label) []SentenceLabeling {
	// ">=" rather than "==" guarantees termination for over-long prefixes.
	if len(appliedLabels) >= len(words) {
		return []SentenceLabeling{{Labels: appliedLabels}}
	}

	var result []SentenceLabeling
	for _, label := range allLabels {
		extended := make([]Label, len(appliedLabels)+1, len(words))
		copy(extended, appliedLabels)
		extended[len(appliedLabels)] = label

		result = append(result, recursivelyLabelWord(words, allLabels, extended)...)
	}

	return result
}

// getAllPossibleLabelings enumerates every assignment of labels to words,
// with the label of the first word varying slowest. The result has
// len(labels)^len(words) entries, so this is only practical for short
// sentences and small label sets. It returns nil when there are no words or
// no labels.
func getAllPossibleLabelings(words []string, labels []Label) []SentenceLabeling {
	if len(words) == 0 || len(labels) == 0 {
		return nil
	}

	return recursivelyLabelWord(words, labels, nil)
}

// scoreAllLabelings enumerates every labeling of the sentence and fills in
// its Score.
func (s *Sentence) scoreAllLabelings(features []Feature, labels []Label) []SentenceLabeling {
	labelings := getAllPossibleLabelings(s.Words, labels)

	for i := range labelings {
		labelings[i].Score = s.ScoreLabeling(&labelings[i], features)
	}

	return labelings
}

// calculateNormalizationConstant sums the scores of all labelings.
func calculateNormalizationConstant(labelings []SentenceLabeling) float64 {
	sum := float64(0)

	for i := range labelings {
		sum += labelings[i].Score
	}

	return sum
}

// calculateLabelProbabilities scores every labeling of the sentence and
// normalizes the scores into probabilities.
func (s *Sentence) calculateLabelProbabilities(features []Feature, labels []Label) []SentenceLabeling {
	labelings := s.scoreAllLabelings(features, labels)
	normalizationConstant := calculateNormalizationConstant(labelings)

	for i := range labelings {
		labelings[i].Probability = labelings[i].Score / normalizationConstant
	}

	return labelings
}

// CalculateBestLabeling determines the best labeling of the sentence and
// stores it in s.Labeling. Ties are resolved in favour of the labeling that
// is enumerated first. When there is nothing to choose from (no words or no
// labels) the sentence's labeling is cleared.
func (s *Sentence) CalculateBestLabeling(features []Feature, labels []Label) {
	labelings := s.calculateLabelProbabilities(features, labels)
	if len(labelings) == 0 {
		s.Labeling = SentenceLabeling{}
		return
	}

	best := 0
	for i := range labelings {
		if labelings[i].Probability > labelings[best].Probability {
			best = i
		}
	}

	s.Labeling = labelings[best]
}

// LearnWeights attempts to learn the weight to use for each of the given
// feature functions using the provided labels and training data. The learned
// weights are written to features in place.
//
// Weights start at random values and are then adjusted one training sentence
// at a time by gradient ascent on the log probability of the sentence's
// correct labeling. For feature j that gradient is
//
//	count_j(correct labeling) - sum over labelings of P(labeling) * count_j(labeling)
//
// where count_j is the number of positions at which the feature fires.
// Iteration on a sentence stops once the largest weight change is at most the
// threshold, or after a fixed number of iterations so that a non-converging
// sentence cannot hang the caller. Sentences without words are skipped.
//
// Every labeling of a sentence is enumerated, so the cost grows as
// len(labels)^len(words).
func LearnWeights(features []Feature, labels []Label, trainingData []Sentence) {
	randomWeights := getRandomWeights(len(features))

	// assign random weights to each feature function
	for j := range features {
		features[j].Weight = randomWeights[j]
	}

	const (
		threshold     = 0.01
		learningRate  = 1.0
		maxIterations = 1000
	)

	// loop through all of the training sentences
	for i := range trainingData {
		sentence := &trainingData[i]
		fmt.Printf("Analyzing sentence: %v\n", sentence.Words)

		candidates := getAllPossibleLabelings(sentence.Words, labels)
		if len(candidates) == 0 {
			continue
		}

		// Feature counts do not depend on the weights, so they are computed
		// once per sentence rather than once per iteration.
		trueCounts := featureCounts(features, sentence.Words, &sentence.Labeling)
		candidateCounts := make([][]float64, len(candidates))
		for k := range candidates {
			candidateCounts[k] = featureCounts(features, sentence.Words, &candidates[k])
		}

		// keep moving the weights until they settle
		for iteration := 0; iteration < maxIterations; iteration++ {
			probabilities := labelingProbabilities(features, candidateCounts)

			largestChange := 0.0
			for j := range features {
				expectedCount := 0.0
				for k, probability := range probabilities {
					expectedCount += probability * candidateCounts[k][j]
				}

				// gradient of the log probability of the training example
				change := learningRate * (trueCounts[j] - expectedCount)
				features[j].Weight += change
				largestChange = math.Max(largestChange, math.Abs(change))
			}

			if largestChange <= threshold {
				break
			}
		}
	}
}

// featureCounts returns, for each feature, the number of positions at which
// it fires for the given labeling, independent of the feature's weight.
func featureCounts(features []Feature, words []string, labeling *SentenceLabeling) []float64 {
	counts := make([]float64, len(features))
	for j := range features {
		unit := Feature{Weight: 1, Value: features[j].Value}
		counts[j] = unit.EvaluateFeature(words, labeling)
	}
	return counts
}

// labelingProbabilities returns the model probability of each candidate
// labeling under the current feature weights, given per-labeling feature
// counts. The highest score is subtracted before exponentiating so that large
// weights cannot overflow.
func labelingProbabilities(features []Feature, counts [][]float64) []float64 {
	probabilities := make([]float64, len(counts))
	highest := math.Inf(-1)
	for k := range counts {
		for j := range features {
			probabilities[k] += features[j].Weight * counts[k][j]
		}
		highest = math.Max(highest, probabilities[k])
	}

	total := 0.0
	for k := range probabilities {
		probabilities[k] = math.Exp(probabilities[k] - highest)
		total += probabilities[k]
	}
	for k := range probabilities {
		probabilities[k] /= total
	}

	return probabilities
}

// getRandomWeights returns num random non-negative weights that are
// normalized to sum to one.
func getRandomWeights(num int) []float64 {
	randomNumbers := make([]float64, num)
	sum := float64(0)
	for i := range randomNumbers {
		randomNumbers[i] = rand.Float64()
		sum += randomNumbers[i]
	}
	for i := range randomNumbers {
		randomNumbers[i] = randomNumbers[i] / sum
	}

	return randomNumbers
}

// removeEmptyString returns a copy of arr without its empty strings. A nil
// input is returned unchanged; any other input yields a non-nil slice.
func removeEmptyString(arr []string) []string {
	if arr == nil {
		return arr
	}
	result := make([]string, 0)
	for _, s := range arr {
		if s != "" {
			result = append(result, s)
		}
	}
	return result
}