├── data └── .gitkeep ├── .gitignore ├── go.mod ├── renovate.json ├── .github └── workflows │ └── ci.yaml ├── evaluation_test.go ├── sentence.go ├── sentence_test.go ├── feature_test.go ├── decoder.go ├── decoder_test.go ├── Makefile ├── action.go ├── state_test.go ├── word.go ├── LICENSE ├── evaluation.go ├── go.sum ├── state.go ├── action_test.go ├── reader.go ├── README.md ├── perceptron_test.go ├── main.go ├── perceptron.go └── feature.go /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/.gitkeep 2 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module go-easy-first 2 | 3 | go 1.13 4 | 5 | require ( 6 | github.com/olekukonko/tablewriter v0.0.5 7 | github.com/urfave/cli/v2 v2.3.0 8 | ) 9 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "github>syou6162/renovate-config" 5 | ] 6 | } -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | name: Test 8 | runs-on: ubuntu-16.04 9 | steps: 10 | - name: Checkout code 11 | uses: actions/checkout@master 12 | - name: Build 13 | env: 14 | GOPATH: /home/runner/work/ 15 | run: make deps bindata build 16 | -------------------------------------------------------------------------------- /evaluation_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestDependencyAccuracy(t *testing.T) { 8 | g1 := []int{1, 2, 3} 9 | g2 := []int{1, 2, 3} 10 | g3 := []int{1, 2, 3, 4} 11 | g := [][]int{g1, g2, g3} 12 | 13 | p1 := []int{1, 2, 30} 14 | p2 := []int{1, 2, 30} 15 | p3 := []int{1, 2, 3, 40} 16 | p := [][]int{p1, p2, p3} 17 | 18 | if a, _ := dependencyAccuracy(g, p); a != 0.7 { 19 | t.Error("dependency accuracy must be 0.7") 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /sentence.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | type Sentence struct { 4 | words []*Word 5 | } 6 | 7 | // extract heads without root for evaluation 8 | func (sent *Sentence) ExtractHeads() []int { 9 | heads := make([]int, 0) 10 | for _, w := range sent.words[1:] { 11 | heads = append(heads, w.head) 12 | } 13 | return heads 14 | } 15 | 16 | func (sent *Sentence) ExtractPredictedHeads() []int { 17 | heads := make([]int, 0) 18 | for _, w := range sent.words[1:] { 19 | heads = append(heads, w.predHead) 20 | } 21 | return heads 22 | } 23 | -------------------------------------------------------------------------------- /sentence_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestExtractHeads(t *testing.T) { 9 | words := make([]*Word, 0) 10 | words = append(words, 11 | makeRootWord(), 12 | makeWord("ms.", "NNP", 1, 2), 13 | makeWord("hang", "NNP", 2, 3), 14 | makeWord("plays", "VBZ", 3, 0), 15 | makeWord("elianti", "NNP", 4, 3), 16 | makeWord(".", ".", 5, 3), 17 | ) 18 | sent := Sentence{words: words} 19 | head := sent.ExtractHeads() 20 | 21 | if !reflect.DeepEqual(head, []int{2, 3, 0, 3, 3}) { 22 | t.Error("head extraction seems wrong") 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /feature_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestAddUnigramFeatures(t *testing.T) { 8 | words := make([]*Word, 0) 9 | words = append(words, 10 | makeRootWord(), 11 | makeWord("ms.", "NNP", 1, 2), 12 | makeWord("hang", "NNP", 2, 3), 13 | makeWord("plays", "VBZ", 3, 0), 14 | makeWord("elianti", "NNP", 4, 3), 15 | makeWord(".", ".", 5, 3), 16 | ) 17 | s := NewState(words) 18 | features := make([]int, 0) 19 | AddUnigramFeatures(&features, s, "left", 1) 20 | 21 | if len(features) == 0 { 22 | t.Error("length of features must be greater than 0") 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /decoder.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | func isFinished(state *State) bool { 4 | return len(state.pending) == 1 5 | } 6 | 7 | func decode(weight *[]float64, state *State) { 8 | if isFinished(state) { 9 | // Do nothing 10 | } else { 11 | pair := BestActionIndexPair(weight, state) 12 | pair.action(state, pair.index) 13 | state.ResetFvCache(pair.index) 14 | decode(weight, state) 15 | } 16 | } 17 | 18 | func Decode(weight *[]float64, sent *Sentence) { 19 | s := NewState(sent.words) 20 | decode(weight, s) 21 | 22 | for child, parent := range s.arcs { 23 | sent.words[child].predHead = parent 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /decoder_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestDecode(t *testing.T) { 8 | words := make([]*Word, 0) 9 | words = append(words, 10 | makeRootWord(), 11 | makeWord("ms.", "NNP", 1, 2), 12 | makeWord("hang", "NNP", 2, 3), 13 | makeWord("plays", "VBZ", 3, 0), 14 | makeWord("elianti", "NNP", 4, 3), 15 | makeWord(".", ".", 5, 3), 16 | ) 17 | sent := Sentence{words: words} 18 | weight := make([]float64, MaxFeatureLength) 19 | 20 | s := NewState(sent.words) 21 | decode(&weight, s) 22 | if len(s.arcs) == 0 { 23 | t.Error("length of arcs must be greater than 0") 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | COVERAGE = coverage.out 2 | 3 | all: build 4 | 5 | .PHONY: deps 6 | deps: 7 | go get github.com/mattn/goveralls 8 | go get -u github.com/jteeuwen/go-bindata/... 9 | 10 | .PHONY: bindata 11 | bindata: 12 | ${GOPATH}/bin/go-bindata -ignore='\.gitkeep' data 13 | 14 | .PHONY: build 15 | build: 16 | go build -v 17 | 18 | .PHONY: fmt 19 | fmt: 20 | gofmt -s -w $$(git ls-files | grep -e '\.go$$' | grep -v -e vendor) 21 | 22 | .PHONY: test 23 | test: 24 | go test -v ./... 25 | 26 | .PHONY: cover 27 | cover: 28 | go test -v -cover -race -coverprofile=${COVERAGE} 29 | 30 | .PHONY: vet 31 | vet: 32 | go tool vet --all *.go 33 | 34 | .PHONY: test-all 35 | test-all: vet test 36 | -------------------------------------------------------------------------------- /action.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | type StateAction func(state *State, idx int) 4 | 5 | // AttachLeft は左側の単語を右側の単語の親にします 6 | func AttachLeft(state *State, idx int) { 7 | parent := state.pending[idx] 8 | child := state.pending[idx+1] 9 | 10 | state.deletePending(idx + 1) 11 | parent.appendChild(child) 12 | state.arcs[child.idx] = parent.idx 13 | } 14 | 15 | // AttachRight は右側の単語を左側の単語の親にします 16 | func AttachRight(state *State, idx int) { 17 | parent := state.pending[idx+1] 18 | child := state.pending[idx] 19 | 20 | state.deletePending(idx) 21 | parent.prependChild(child) 22 | state.arcs[child.idx] = parent.idx 23 | } 24 | 25 | // StateActions はActionの集合です 26 | var StateActions = []StateAction{AttachLeft, AttachRight} 27 | -------------------------------------------------------------------------------- /state_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestDeletePending(t *testing.T) { 8 | words := make([]*Word, 0) 9 | words = append(words, 10 | makeRootWord(), 11 | makeWord("ms.", "NNP", 0, -1), 12 | makeWord("hang", "NNP", 1, 0), 13 | makeWord("plays", "VBZ", 2, 1), 14 | ) 15 | s := NewState(words) 16 | s.deletePending(2) 17 | 18 | if s.pending[1].surface != "ms." { 19 | t.Error("surface must be 'ms.'") 20 | } 21 | if s.pending[2].surface != "plays" { 22 | t.Error("surface must be 'plays'") 23 | } 24 | 25 | s.deletePending(1) 26 | if s.pending[1].surface != "plays" { 27 | t.Error("surface must be 'plays'") 28 | } 29 | 30 | if words[1].surface != "ms." { 31 | t.Error("surface is wrong!!!" + words[1].surface) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /word.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | type Word struct { 4 | surface string 5 | lemma string 6 | posTag string 7 | cposTag string 8 | idx int 9 | head int 10 | predHead int 11 | children []Word 12 | } 13 | 14 | func makeWord(surface string, posTag string, idx int, head int) *Word { 15 | return &Word{surface, surface, posTag, posTag, idx, head, head, make([]Word, 0)} 16 | } 17 | 18 | func makeRootWord() *Word { 19 | return makeWord("*ROOT*", "*ROOT*", 0, -1) 20 | } 21 | 22 | func (word *Word) appendChild(c *Word) []Word { 23 | word.children = append(word.children, *c) 24 | return word.children 25 | } 26 | 27 | func (word *Word) prependChild(c *Word) []Word { 28 | word.children = append([]Word{*c}, word.children...) 29 | return word.children 30 | } 31 | 32 | func (word *Word) LeftMostChild() *Word { 33 | if len(word.children) == 0 { 34 | return nil 35 | } else { 36 | return &word.children[0] 37 | } 38 | } 39 | 40 | func (word *Word) RightMostChild() *Word { 41 | if len(word.children) == 0 { 42 | return nil 43 | } else { 44 | return &word.children[len(word.children)-1] 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Yasuhisa Yoshida 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. -------------------------------------------------------------------------------- /evaluation.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "runtime" 6 | "sync" 7 | ) 8 | 9 | func dependencyAccuracy(golds [][]int, predictions [][]int) (float64, error) { 10 | if len(golds) != len(predictions) { 11 | return 0.0, errors.New("length of golds and that of predictions is not same") 12 | } 13 | sum := 0.0 14 | count := 0.0 15 | for idx, gold := range golds { 16 | pred := predictions[idx] 17 | if len(gold) != len(pred) { 18 | return 0.0, errors.New("length of gold and that of pred is not same") 19 | } 20 | for i, g := range gold { 21 | if g == pred[i] { 22 | sum += 1.0 23 | } 24 | count += 1.0 25 | } 26 | } 27 | return sum / count, nil 28 | } 29 | 30 | func DependencyAccuracy(w *[]float64, sents []*Sentence) float64 { 31 | wg := &sync.WaitGroup{} 32 | goldHeads := make([][]int, 0) 33 | for _, sent := range sents { 34 | goldHeads = append(goldHeads, sent.ExtractHeads()) 35 | } 36 | 37 | predHeads := make([][]int, 0) 38 | 39 | cpus := runtime.NumCPU() 40 | semaphore := make(chan int, cpus) 41 | for _, sent := range sents { 42 | wg.Add(1) 43 | go func(sent *Sentence) { 44 | defer wg.Done() 45 | semaphore <- 1 46 | Decode(w, sent) 47 | <-semaphore 48 | }(sent) 49 | } 50 | wg.Wait() 51 | 52 | for _, sent := range sents { 53 | predHeads = append(predHeads, sent.ExtractPredictedHeads()) 54 | } 55 | accuracy, _ := dependencyAccuracy(goldHeads, predHeads) 56 | return accuracy 57 | } 58 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 2 | github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY= 3 | github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= 4 | github.com/mattn/go-runewidth v0.0.9 h1:Lm995f3rfxdpd6TSmuVCHVb/QhupuXlYr8sCI/QdE+0= 5 | github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= 6 | github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= 7 | github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= 8 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 9 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 10 | github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q= 11 | github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 12 | github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo= 13 | github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= 14 | github.com/urfave/cli v1.22.5 h1:lNq9sAHXK2qfdI8W+GRItjCEkI+2oR4d+MEHy1CKXoU= 15 | github.com/urfave/cli v1.22.5/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= 16 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 17 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 18 | -------------------------------------------------------------------------------- /state.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "math" 5 | "reflect" 6 | "runtime" 7 | "strconv" 8 | ) 9 | 10 | type FvCache map[string][]int 11 | 12 | type State struct { 13 | pending []*Word 14 | arcs map[int]int 15 | fvCache FvCache 16 | } 17 | 18 | func (state *State) cacheKeyStr(pair ActionIndexPair) string { 19 | funcName := runtime.FuncForPC(reflect.ValueOf(pair.action).Pointer()).Name() 20 | left := state.pending[pair.index] 21 | right := state.pending[pair.index+1] 22 | return funcName + ":" + strconv.Itoa(left.idx) + "-" + strconv.Itoa(right.idx) 23 | } 24 | 25 | func (state *State) InitFvCache() { 26 | for _, f := range StateActions { 27 | for idx := 0; idx < len(state.pending)-1; idx++ { 28 | pair := ActionIndexPair{f, idx} 29 | fv := ExtractFeatures(state, pair) 30 | state.fvCache[state.cacheKeyStr(pair)] = fv 31 | } 32 | } 33 | } 34 | 35 | func NewState(pending []*Word) *State { 36 | for _, w := range pending { 37 | w.children = make([]Word, 0) 38 | } 39 | p := make([]*Word, len(pending)) 40 | copy(p, pending) 41 | state := State{p, make(map[int]int), FvCache{}} 42 | state.InitFvCache() 43 | return &state 44 | } 45 | 46 | func (state *State) deletePending(idx int) []*Word { 47 | state.pending = append(state.pending[:idx], state.pending[idx+1:]...) 48 | return state.pending 49 | } 50 | 51 | func (state *State) ResetFvCache(index int) { 52 | for _, f := range StateActions { 53 | min := int(math.Max(0, float64(index-3))) 54 | max := int(math.Min(float64(len(state.pending)-1), float64(index+3))) 55 | for idx := min; idx < max; idx++ { 56 | pair := ActionIndexPair{f, idx} 57 | delete(state.fvCache, state.cacheKeyStr(pair)) 58 | } 59 | } 60 | } 61 | 62 | func (state *State) GetFvCache(pair ActionIndexPair) []int { 63 | key := state.cacheKeyStr(pair) 64 | if fv, ok := state.fvCache[key]; ok { 65 | return fv 66 | } else { 67 | fv = ExtractFeatures(state, pair) 68 | state.fvCache[key] = fv 69 | return fv 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /action_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestAttachLeft(t *testing.T) { 8 | words := make([]*Word, 0) 9 | words = append(words, 10 | makeRootWord(), 11 | makeWord("ms.", "NNP", 1, 2), 12 | makeWord("hang", "NNP", 2, 3), 13 | makeWord("plays", "VBZ", 3, 0), 14 | makeWord("elianti", "NNP", 4, 3), 15 | makeWord(".", ".", 5, 3), 16 | ) 17 | s := NewState(words) 18 | AttachLeft(s, 3) 19 | p, ok := s.arcs[4] 20 | if !ok || p != 3 { 21 | t.Error("parent's index must be 3") 22 | } 23 | 24 | AttachLeft(s, 3) 25 | p, ok = s.arcs[5] 26 | if !ok || p != 3 { 27 | t.Error("parent's index must be 3") 28 | } 29 | 30 | if len(s.pending) != 4 { 31 | t.Error("length of pending must be 4") 32 | } 33 | } 34 | 35 | func TestAttachRight(t *testing.T) { 36 | words := make([]*Word, 0) 37 | words = append(words, 38 | makeRootWord(), 39 | makeWord("ms.", "NNP", 1, 2), 40 | makeWord("hang", "NNP", 2, 3), 41 | makeWord("plays", "VBZ", 3, 0), 42 | makeWord("elianti", "NNP", 4, 3), 43 | makeWord(".", ".", 5, 3), 44 | ) 45 | s := NewState(words) 46 | AttachRight(s, 3) 47 | p, ok := s.arcs[3] 48 | if !ok || p != 4 { 49 | t.Error("parent's index must be 4") 50 | } 51 | 52 | AttachRight(s, 3) 53 | p, ok = s.arcs[4] 54 | if !ok || p != 5 { 55 | t.Error("parent's index must be 5") 56 | } 57 | 58 | if len(s.pending) != 4 { 59 | t.Error("length of pending must be 4") 60 | } 61 | } 62 | 63 | func TestAttachLeftAll(t *testing.T) { 64 | words := make([]*Word, 0) 65 | words = append(words, 66 | makeRootWord(), 67 | makeWord("ms.", "NNP", 1, 2), 68 | makeWord("hang", "NNP", 2, 3), 69 | makeWord("plays", "VBZ", 3, 0), 70 | makeWord("elianti", "NNP", 4, 3), 71 | makeWord(".", ".", 5, 3), 72 | ) 73 | s := NewState(words) 74 | AttachLeft(s, 0) 75 | AttachLeft(s, 0) 76 | AttachLeft(s, 0) 77 | AttachLeft(s, 0) 78 | AttachLeft(s, 0) 79 | if words[1].surface != "ms." { 80 | t.Error("surface is wrong") 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /reader.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "encoding/gob" 6 | "errors" 7 | "io/ioutil" 8 | "os" 9 | "strconv" 10 | "strings" 11 | ) 12 | 13 | func makeSentence(s string) (*Sentence, error) { 14 | lines := strings.Split(s, "\n") 15 | if len(lines) < 4 { 16 | return nil, errors.New("Invalid line") 17 | } 18 | words := strings.Split(strings.TrimSpace(lines[0]), "\t") 19 | posTags := strings.Split(strings.TrimSpace(lines[1]), "\t") 20 | heads := strings.Split(strings.TrimSpace(lines[3]), "\t") 21 | 22 | sent := make([]*Word, 0) 23 | sent = append(sent, makeRootWord()) 24 | for i := 0; i < len(words); i++ { 25 | head, err := strconv.ParseInt(heads[i], 10, 0) 26 | if err != nil { 27 | return nil, err 28 | } 29 | sent = append(sent, makeWord(words[i], posTags[i], i+1, int(head))) 30 | } 31 | return &Sentence{sent}, nil 32 | } 33 | 34 | func splitBySentence(s string) []string { 35 | return strings.Split(s, "\n\n") 36 | } 37 | 38 | func ReadData(filename string) ([]*Sentence, error) { 39 | file, err := os.Open(filename) 40 | if err != nil { 41 | return nil, err 42 | } 43 | 44 | data, err := ioutil.ReadAll(bufio.NewReader(file)) 45 | if err != nil { 46 | return nil, err 47 | } 48 | 49 | sentences := make([]*Sentence, 0) 50 | for _, sent := range splitBySentence(string(data)) { 51 | s, err := makeSentence(sent) 52 | if err != nil { 53 | break 54 | } 55 | sentences = append(sentences, s) 56 | } 57 | return sentences, nil 58 | } 59 | 60 | func SaveModel(weight *[]float64, filename string) error { 61 | file, err := os.Create(filename) 62 | defer file.Close() 63 | if err != nil { 64 | return err 65 | } 66 | 67 | enc := gob.NewEncoder(file) 68 | enc.Encode(&weight) 69 | return nil 70 | } 71 | 72 | func LoadModel(filename string) (*[]float64, error) { 73 | var w []float64 74 | file, err := os.Open(filename) 75 | defer file.Close() 76 | if err != nil { 77 | return nil, err 78 | } 79 | 80 | decoder := gob.NewDecoder(file) 81 | decoder.Decode(&w) 82 | return &w, nil 83 | } 84 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-easy-first 2 | [![CircleCI](https://circleci.com/gh/syou6162/go-easy-first.svg?style=shield)](https://circleci.com/gh/syou6162/go-easy-first) 3 | [![Go Report Card](https://goreportcard.com/badge/github.com/syou6162/go-easy-first)](https://goreportcard.com/report/github.com/syou6162/go-easy-first) 4 | [![Coverage Status](https://coveralls.io/repos/github/syou6162/go-easy-first/badge.svg?branch=coveralls)](https://coveralls.io/github/syou6162/go-easy-first?branch=coveralls) 5 | 6 | go-easy-first - Dependency Parser with Easy-First Algorithm (An Efficient Algorithm for Easy-First Non-Directional Dependency Parsing, NAACL-2010, Yoav Goldberg and Michael Elhadad) written in Go. 7 | 8 | # Build from source 9 | 10 | ```sh 11 | % git clone https://github.com/syou6162/go-easy-first.git 12 | % cd go-easy-first 13 | % make deps && make bindata && make build 14 | ``` 15 | 16 | # Usage 17 | go-easy-first has `train` (training a parser phase) and `eval` (evaluating a trained parser phase) modes. To see the detail options, type `./go-easy-first --help`. 18 | 19 | ## Training a parser 20 | To see the detail options, type `./go-easy-first train --help`. 21 | 22 | ```sh 23 | % ./go-easy-first train --train-filename path/to/train.txt --dev-filename path/to/dev.txt --max-iter 10 --model-filename model.bin 24 | 0, 0.907, 0.893 25 | 1, 0.920, 0.901 26 | 2, 0.929, 0.904 27 | 3, 0.935, 0.906 28 | 4, 0.940, 0.907 29 | 5, 0.944, 0.907 30 | 6, 0.947, 0.908 31 | 7, 0.950, 0.908 32 | 8, 0.953, 0.908 33 | 9, 0.955, 0.908 34 | ``` 35 | 36 | ## Evaluating a trained parser 37 | To see the detail options, type `./go-easy-first eval --help`. 38 | 39 | ```sh 40 | % ./go-easy-first eval --test-filename path/to/test.txt --model-filename model.bin 41 | | SENTENCES | SECONDS | ACCURACY | 42 | |-----------|---------|----------| 43 | | 1346 | 4.60 | 0.888 | 44 | ``` 45 | 46 | ## [Experimental] Obtain a single binary embedded model parameters 47 | 48 | ```sh 49 | % ./go-easy-first train --train-filename path/to/train.txt --dev-filename path/to/dev.txt --max-iter 10 --model-filename data/model.bin 50 | % make bindata && make build 51 | % ./go-easy-first decode --test-filename path/to/test.txt 52 | ``` 53 | 54 | # Roadmap 55 | - [ ] Implement PP-Attachment features 56 | - [ ] Beam search with max-violation perceptron 57 | - [ ] Mini-batch update 58 | - [x] Embed weight parameters to a built binary file using go-bindata 59 | 60 | # Author 61 | Yasuhisa Yoshida 62 | -------------------------------------------------------------------------------- /perceptron_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestEdgeFor(t *testing.T) { 9 | words := make([]*Word, 0) 10 | words = append(words, 11 | makeWord("ms.", "NNP", 0, -1), 12 | makeWord("hang", "NNP", 1, 0), 13 | makeWord("plays", "VBZ", 2, 1), 14 | ) 15 | s := NewState(words) 16 | pair, err := EdgeFor(s, 0, 0) 17 | if err != nil { 18 | t.Error("error should be nil") 19 | } 20 | if !reflect.DeepEqual(pair, []int{0, 1}) { 21 | t.Error("pair shoud be [0, 1] but: ", pair) 22 | } 23 | } 24 | 25 | func TestIsValidFalse(t *testing.T) { 26 | words := make([]*Word, 0) 27 | words = append(words, 28 | makeWord("ms.", "NNP", 0, -1), 29 | makeWord("hang", "NNP", 1, 0), 30 | makeWord("plays", "VBZ", 2, 1), 31 | ) 32 | s := NewState(words) 33 | goldArcs := make(map[int][]int) 34 | goldArcs[-1] = []int{0} 35 | goldArcs[0] = []int{1} 36 | goldArcs[1] = []int{2} 37 | if IsValid(s, 0, 0, goldArcs) != false { 38 | t.Error("should return false") 39 | } 40 | } 41 | 42 | func TestIsValidTrue(t *testing.T) { 43 | words := make([]*Word, 0) 44 | words = append(words, 45 | makeWord("ms.", "NNP", 0, -1), 46 | makeWord("hang", "NNP", 1, 0), 47 | makeWord("plays", "VBZ", 2, 1), 48 | ) 49 | 50 | s := NewState(words) 51 | arcs := make(map[int]int) 52 | arcs[2] = 1 53 | s.arcs = arcs 54 | goldArcs := make(map[int][]int) 55 | goldArcs[-1] = []int{0} 56 | goldArcs[0] = []int{1} 57 | goldArcs[1] = []int{2} 58 | if IsValid(s, 0, 0, goldArcs) != true { 59 | t.Error("should return true") 60 | } 61 | } 62 | 63 | func TestAllowedActions(t *testing.T) { 64 | words := make([]*Word, 0) 65 | words = append(words, 66 | makeRootWord(), 67 | makeWord("ms.", "NNP", 1, 2), 68 | makeWord("hang", "NNP", 2, 3), 69 | makeWord("plays", "VBZ", 3, 0), 70 | makeWord("elianti", "NNP", 4, 3), 71 | makeWord(".", ".", 5, 3), 72 | ) 73 | s := NewState(words) 74 | AttachRight(s, 3) 75 | 76 | goldArcs := make(map[int][]int) 77 | goldArcs[-1] = []int{0} 78 | goldArcs[0] = []int{1} 79 | goldArcs[1] = []int{2} 80 | 81 | if 1 != len(AllowedActions(s, goldArcs)) { 82 | t.Error("length of allowed actions must be 1") 83 | } 84 | } 85 | 86 | func TestCandidateActions(t *testing.T) { 87 | words := make([]*Word, 0) 88 | words = append(words, 89 | makeRootWord(), 90 | makeWord("ms.", "NNP", 1, 2), 91 | makeWord("hang", "NNP", 2, 3), 92 | makeWord("plays", "VBZ", 3, 0), 93 | makeWord("elianti", "NNP", 4, 3), 94 | makeWord(".", ".", 5, 3), 95 | ) 96 | s := NewState(words) 97 | 98 | if 10 != len(CandidateActions(s)) { 99 | t.Error("length of candidate actions must be 10") 100 | } 101 | } 102 | 103 | func TestUpdateWeight(t *testing.T) { 104 | model := NewModel() 105 | gold := []int{1, 2, 3} 106 | predict := []int{1, 3, 4} 107 | model.updateWeight(&gold, &predict) 108 | 109 | if w := model.weight[1]; w != 0 { 110 | t.Error("weight of '1' must be 0") 111 | } 112 | if w := model.weight[2]; w != 1 { 113 | t.Error("weight of '2' must be 1") 114 | } 115 | if w := model.weight[3]; w != 0 { 116 | t.Error("weight of '3' must be 0") 117 | } 118 | if w := model.weight[4]; w != -1 { 119 | t.Error("weight of '4' must be -1") 120 | } 121 | 122 | model.updateWeight(&gold, &predict) 123 | 124 | if w := model.cumWeight[1]; w != 0 { 125 | t.Error("cumWeight of '1' must be 0") 126 | } 127 | if w := model.cumWeight[2]; w != 3 { 128 | t.Error("cumWeight of '2' must be 3") 129 | } 130 | if w := model.cumWeight[3]; w != 0 { 131 | t.Error("cumWeight of '3' must be 0") 132 | } 133 | if w := model.cumWeight[4]; w != -3 { 134 | t.Error("cumWeight of '4' must be -3") 135 | } 136 | } 137 | 138 | func TestUpdate(t *testing.T) { 139 | words := make([]*Word, 0) 140 | words = append(words, 141 | makeRootWord(), 142 | makeWord("ms.", "NNP", 1, 2), 143 | makeWord("hang", "NNP", 2, 3), 144 | makeWord("plays", "VBZ", 3, 0), 145 | makeWord("elianti", "NNP", 4, 3), 146 | makeWord(".", ".", 5, 3), 147 | ) 148 | sent := Sentence{words: words} 149 | model := NewModel() 150 | model.Update(&sent) 151 | if model.count == 1 { 152 | t.Error("count must be greater than 1") 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/gob" 6 | "fmt" 7 | "math/rand" 8 | "os" 9 | "runtime" 10 | "time" 11 | 12 | "github.com/olekukonko/tablewriter" 13 | "github.com/urfave/cli" 14 | ) 15 | 16 | func shuffle(data []*Sentence) { 17 | n := len(data) 18 | for i := n - 1; i >= 0; i-- { 19 | j := rand.Intn(i + 1) 20 | data[i], data[j] = data[j], data[i] 21 | } 22 | } 23 | 24 | func printEvaluation(data [][]string) { 25 | table := tablewriter.NewWriter(os.Stdout) 26 | table.SetHeader([]string{"Sentences", "Seconds", "Accuracy"}) 27 | table.SetBorders(tablewriter.Border{Left: true, Top: false, Right: true, Bottom: false}) 28 | table.SetCenterSeparator("|") 29 | table.AppendBulk(data) // Add Bulk Data 30 | table.Render() 31 | } 32 | 33 | var commandTrain = cli.Command{ 34 | Name: "train", 35 | Usage: "Train a parsing model by easy-first algorithm", 36 | Description: ` 37 | Train a parsing model by easy-first algorithm. 38 | `, 39 | Action: doTrain, 40 | Flags: []cli.Flag{ 41 | cli.StringFlag{Name: "train-filename"}, 42 | cli.StringFlag{Name: "dev-filename"}, 43 | cli.StringFlag{Name: "model-filename"}, 44 | cli.IntFlag{Name: "max-iter", Value: 10}, 45 | }, 46 | } 47 | 48 | var commandEval = cli.Command{ 49 | Name: "eval", 50 | Usage: "Evaluate a parsing model by easy-first algorithm", 51 | Description: ` 52 | Evaluate a parsing model by easy-first algorithm. 53 | `, 54 | Action: doEval, 55 | Flags: []cli.Flag{ 56 | cli.StringFlag{Name: "test-filename"}, 57 | cli.StringFlag{Name: "model-filename"}, 58 | }, 59 | } 60 | 61 | // This is an experimental feature 62 | var commandDecode = cli.Command{ 63 | Name: "decode", 64 | Usage: "Decode a sentence with an embeded model", 65 | Description: ` 66 | Decode a sentence with an embeded model. 67 | `, 68 | Action: doDecode, 69 | Flags: []cli.Flag{ 70 | cli.StringFlag{Name: "test-filename"}, 71 | }, 72 | } 73 | 74 | var Commands = []cli.Command{ 75 | commandTrain, 76 | commandEval, 77 | commandDecode, 78 | } 79 | 80 | func doTrain(c *cli.Context) error { 81 | trainFilename := c.String("train-filename") 82 | devFilename := c.String("dev-filename") 83 | modelFilename := c.String("model-filename") 84 | maxIter := c.Int("max-iter") 85 | 86 | if trainFilename == "" { 87 | _ = cli.ShowCommandHelp(c, "train") 88 | return cli.NewExitError("`train-filename` is a required field to train a parser.", 1) 89 | } 90 | 91 | if devFilename == "" { 92 | _ = cli.ShowCommandHelp(c, "train") 93 | return cli.NewExitError("`dev-filename` is a required field to train a parser.", 1) 94 | } 95 | 96 | if modelFilename == "" { 97 | _ = cli.ShowCommandHelp(c, "train") 98 | return cli.NewExitError("`model-filename` is a required field to train a parser.", 1) 99 | } 100 | 101 | goldSents, _ := ReadData(trainFilename) 102 | devSents, _ := ReadData(devFilename) 103 | 104 | model := NewModel() 105 | for iter := 0; iter < maxIter; iter++ { 106 | shuffle(goldSents) 107 | for _, sent := range goldSents { 108 | model.Update(sent) 109 | } 110 | w := model.AveragedWeight() 111 | trainAccuracy := DependencyAccuracy(&w, goldSents) 112 | devAccuracy := DependencyAccuracy(&w, devSents) 113 | fmt.Println(fmt.Sprintf("%d, %0.03f, %0.03f", iter, trainAccuracy, devAccuracy)) 114 | } 115 | 116 | w := model.AveragedWeight() 117 | SaveModel(&w, modelFilename) 118 | return nil 119 | } 120 | 121 | func doEval(c *cli.Context) error { 122 | testFilename := c.String("test-filename") 123 | modelFilename := c.String("model-filename") 124 | 125 | if testFilename == "" { 126 | _ = cli.ShowCommandHelp(c, "eval") 127 | return cli.NewExitError("`test-filename` is a required field to evaluate a parser.", 1) 128 | } 129 | 130 | if modelFilename == "" { 131 | _ = cli.ShowCommandHelp(c, "eval") 132 | return cli.NewExitError("`model-filename` is a required field to evaluate a parser.", 1) 133 | } 134 | 135 | goldSents, _ := ReadData(testFilename) 136 | weight, _ := LoadModel(modelFilename) 137 | start := time.Now() 138 | testAccuracy := DependencyAccuracy(weight, goldSents) 139 | end := time.Now().Sub(start).Seconds() 140 | 141 | data := [][]string{ 142 | {fmt.Sprintf("%d", len(goldSents)), fmt.Sprintf("%0.02f", end), fmt.Sprintf("%0.03f", testAccuracy)}, 143 | } 144 | printEvaluation(data) 145 | return nil 146 | } 147 | 148 | func loadModel(filename string) (*[]float64, error) { 149 | var weight []float64 150 | var b bytes.Buffer 151 | tmp, err := Asset(filename) 152 | if err != nil { 153 | return nil, err 154 | } 155 | b.Write(tmp) 156 | 157 | decoder := gob.NewDecoder(&b) 158 | decoder.Decode(&weight) 159 | return &weight, nil 160 | } 161 | 162 | func doDecode(c *cli.Context) error { 163 | testFilename := c.String("test-filename") 164 | 165 | if testFilename == "" { 166 | _ = cli.ShowCommandHelp(c, "decode") 167 | return cli.NewExitError("`test-filename` is a required field to decode sentences.", 1) 168 | } 169 | 170 | goldSents, _ := ReadData(testFilename) 171 | 172 | weight, err := loadModel("data/model.bin") 173 | if err != nil { 174 | return err 175 | } 176 | 177 | start := time.Now() 178 | testAccuracy := DependencyAccuracy(weight, goldSents) 179 | end := time.Now().Sub(start).Seconds() 180 | 181 | data := [][]string{ 182 | {fmt.Sprintf("%d", len(goldSents)), fmt.Sprintf("%0.02f", end), fmt.Sprintf("%0.03f", testAccuracy)}, 183 | } 184 | printEvaluation(data) 185 | return nil 186 | } 187 | 188 | func main() { 189 | app := cli.NewApp() 190 | app.Name = "easy-first" 191 | app.Commands = Commands 192 | 193 | runtime.GOMAXPROCS(runtime.NumCPU()) 194 | 195 | app.Run(os.Args) 196 | } 197 | -------------------------------------------------------------------------------- /perceptron.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "math" 6 | "reflect" 7 | ) 8 | 9 | // GoldArcs returns map of parent => children 10 | func GoldArcs(sent *Sentence) map[int][]int { 11 | result := make(map[int][]int) 12 | for idx, w := range sent.words { 13 | head := w.head 14 | if children, ok := result[head]; ok { 15 | result[head] = append(children, idx) 16 | } else { 17 | result[head] = []int{idx} 18 | } 19 | } 20 | return result 21 | } 22 | 23 | // EdgeFor returns a pair of parent index and child index 24 | func EdgeFor(state *State, actionID int, idx int) ([]int, error) { 25 | switch actionID { 26 | case 0: 27 | return []int{state.pending[idx].idx, state.pending[idx+1].idx}, nil 28 | case 1: 29 | return []int{state.pending[idx+1].idx, state.pending[idx].idx}, nil 30 | default: 31 | return nil, errors.New("Invalid line") 32 | } 33 | } 34 | 35 | // IsValid returns the chosen action/location pair is valid 36 | func IsValid(state *State, actionID int, idx int, goldArcs map[int][]int) bool { 37 | pair, err := EdgeFor(state, actionID, idx) 38 | if err != nil { 39 | return false 40 | } 41 | pIdx := pair[0] 42 | cIdx := pair[1] 43 | containedInGoldArcs := false 44 | for _, i := range goldArcs[pIdx] { 45 | if cIdx == i { 46 | containedInGoldArcs = true 47 | break 48 | } 49 | } 50 | flag := false 51 | for _, cPrime := range goldArcs[cIdx] { 52 | if cIdx != state.arcs[cPrime] { 53 | flag = true 54 | break 55 | } 56 | } 57 | if !containedInGoldArcs || flag { 58 | return false 59 | } 60 | return true 61 | } 62 | 63 | type ActionIndexPair struct { 64 | action StateAction 65 | index int 66 | } 67 | 68 | func (pair1 ActionIndexPair) SameActionIndexPair(pair2 ActionIndexPair) bool { 69 | return pair1.index == pair2.index && 70 | reflect.ValueOf(pair1.action).Pointer() == reflect.ValueOf(pair2.action).Pointer() 71 | } 72 | 73 | func AllowedActions(state *State, goldArcs map[int][]int) []ActionIndexPair { 74 | result := make([]ActionIndexPair, 0) 75 | for actionID, f := range StateActions { 76 | for idx := 0; idx < len(state.pending)-1; idx++ { 77 | if IsValid(state, actionID, idx, goldArcs) { 78 | result = append(result, ActionIndexPair{f, idx}) 79 | } 80 | } 81 | } 82 | return result 83 | } 84 | 85 | func CandidateActions(state *State) []ActionIndexPair { 86 | result := make([]ActionIndexPair, 0) 87 | for _, f := range StateActions { 88 | for idx := 0; idx < len(state.pending)-1; idx++ { 89 | result = append(result, ActionIndexPair{f, idx}) 90 | } 91 | } 92 | return result 93 | } 94 | 95 | func DotProduct(weight *[]float64, fv []int) float64 { 96 | sum := 0.0 97 | for _, f := range fv { 98 | sum += (*weight)[f] 99 | } 100 | return sum 101 | } 102 | 103 | func BestActionIndexPair(weight *[]float64, state *State) ActionIndexPair { 104 | bestScore := math.Inf(-1) 105 | pairs := CandidateActions(state) 106 | bestPair := pairs[0] 107 | for _, pair := range pairs { 108 | fv := state.GetFvCache(pair) 109 | score := DotProduct(weight, fv) 110 | if score > bestScore { 111 | bestPair = pair 112 | bestScore = score 113 | } 114 | } 115 | return bestPair 116 | } 117 | 118 | func BestAllowedActionIndexPair(weight *[]float64, state *State, pairs []ActionIndexPair) ActionIndexPair { 119 | bestScore := math.Inf(-1) 120 | bestPair := pairs[0] 121 | for _, pair := range pairs { 122 | fv := state.GetFvCache(pair) 123 | score := DotProduct(weight, fv) 124 | if score > bestScore { 125 | bestPair = pair 126 | bestScore = score 127 | } 128 | } 129 | return bestPair 130 | } 131 | 132 | type Model struct { 133 | weight []float64 134 | cumWeight []float64 135 | count int 136 | } 137 | 138 | func NewModel() Model { 139 | return Model{make([]float64, MaxFeatureLength), make([]float64, MaxFeatureLength), 1} 140 | } 141 | 142 | func (model *Model) updateWeight(goldFeatureVector *[]int, predictFeatureVector *[]int) { 143 | for _, feat := range *goldFeatureVector { 144 | w := model.weight[feat] 145 | cumW := model.cumWeight[feat] 146 | model.weight[feat] = w + 1.0 147 | model.cumWeight[feat] = cumW + float64(model.count) 148 | } 149 | for _, feat := range *predictFeatureVector { 150 | w := model.weight[feat] 151 | cumW := model.cumWeight[feat] 152 | model.weight[feat] = w - 1.0 153 | model.cumWeight[feat] = cumW - float64(model.count) 154 | } 155 | model.count += 1 156 | } 157 | 158 | func (model *Model) Update(gold *Sentence) { 159 | state := NewState(gold.words) 160 | goldArcs := GoldArcs(gold) 161 | iter := 0 162 | for { 163 | if len(state.pending) <= 1 { 164 | break 165 | } 166 | allow := AllowedActions(state, goldArcs) 167 | choice := BestActionIndexPair(&model.weight, state) 168 | containChoice := false 169 | for _, pair := range allow { 170 | if pair.SameActionIndexPair(choice) { 171 | containChoice = true 172 | } 173 | } 174 | if containChoice { 175 | choice.action(state, choice.index) 176 | state.ResetFvCache(choice.index) 177 | } else { 178 | predFv := state.GetFvCache(choice) 179 | good := BestAllowedActionIndexPair(&model.weight, state, allow) 180 | goodFv := state.GetFvCache(good) 181 | model.updateWeight(&goodFv, &predFv) 182 | } 183 | iter++ 184 | if iter > 500 { // for infinite loop 185 | break 186 | } 187 | } 188 | } 189 | 190 | // w_t - w_cum / t 191 | func (model *Model) AveragedWeight() []float64 { 192 | avg := make([]float64, MaxFeatureLength) 193 | for k, v := range model.weight { 194 | avg[k] = v 195 | } 196 | for k, v := range model.cumWeight { 197 | avg[k] = avg[k] - v/float64(model.count) 198 | } 199 | return avg 200 | } 201 | -------------------------------------------------------------------------------- /feature.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "math" 5 | "reflect" 6 | "runtime" 7 | "strconv" 8 | ) 9 | 10 | func NilSafePosTag(w *Word) string { 11 | posTag := "" 12 | if w != nil { 13 | posTag = w.posTag 14 | } 15 | return posTag 16 | } 17 | 18 | func NilSafePendingWord(state *State, idx int) *Word { 19 | if idx < 0 || idx >= len(state.pending) { 20 | return nil 21 | } else { 22 | return state.pending[idx] 23 | } 24 | } 25 | 26 | func addUnigramFeatures(features *[]int, state *State, actName string, idx int, prefix string) { 27 | if idx < 0 || idx >= len(state.pending) { 28 | return 29 | } 30 | w := state.pending[idx] 31 | lcp := NilSafePosTag(w.LeftMostChild()) 32 | rcp := NilSafePosTag(w.RightMostChild()) 33 | *features = append(*features, 34 | JenkinsHash(actName+"+"+prefix+"+surface:"+w.surface), 35 | JenkinsHash(actName+"+"+prefix+"+lemma:"+w.lemma), 36 | JenkinsHash(actName+"+"+prefix+"+posTag:"+w.posTag), 37 | JenkinsHash(actName+"+"+prefix+"+cposTag:"+w.cposTag), 38 | JenkinsHash(actName+"+"+prefix+"+posTag:"+w.posTag+"+leftmost:"+lcp), 39 | JenkinsHash(actName+"+"+prefix+"+posTag:"+w.posTag+"+rightmost:"+rcp), 40 | JenkinsHash(actName+"+"+prefix+"+posTag:"+w.posTag+"+leftmost:"+lcp+"+rightmost:"+rcp), 41 | ) 42 | } 43 | 44 | func distStr(dist int) string { 45 | d := "0" 46 | switch dist { 47 | case 1: 48 | d = "1" 49 | case 2: 50 | d = "2" 51 | case 3: 52 | d = "3" 53 | case 4: 54 | d = "4" 55 | default: 56 | d = "5" 57 | } 58 | return d 59 | } 60 | 61 | func AddBigramFeatures(features *[]int, actName string, parent *Word, child *Word, prefix string) { 62 | if parent == nil || child == nil { 63 | return 64 | } 65 | 66 | plcp := NilSafePosTag(parent.LeftMostChild()) 67 | prcp := NilSafePosTag(parent.RightMostChild()) 68 | clcp := NilSafePosTag(child.LeftMostChild()) 69 | crcp := NilSafePosTag(child.RightMostChild()) 70 | 71 | *features = append(*features, 72 | JenkinsHash(actName+"+"+prefix+"+parent-surface:"+parent.surface+"+child-surface:"+child.surface), 73 | JenkinsHash(actName+"+"+prefix+"+parent-surface:"+parent.surface+"+child-posTag:"+child.posTag), 74 | JenkinsHash(actName+"+"+prefix+"+parent-posTag:"+parent.posTag+"+child-surface:"+child.surface), 75 | JenkinsHash(actName+"+"+prefix+"+parent-lemma:"+parent.lemma+"+child-lemma:"+child.lemma), 76 | JenkinsHash(actName+"+"+prefix+"+parent-posTag:"+parent.posTag+"+child-posTag:"+child.posTag), 77 | JenkinsHash(actName+"+"+prefix+"+parent-cposTag:"+parent.cposTag+"+child-cposTag:"+child.cposTag), 78 | JenkinsHash(actName+"+"+prefix+"+parent-posTag:"+parent.posTag+"+child-posTag:"+child.posTag+"+plcp:"+plcp+"+prcp:"+prcp), 79 | JenkinsHash(actName+"+"+prefix+"+parent-posTag:"+parent.posTag+"+child-posTag:"+child.posTag+"+plcp:"+plcp+"+crcp:"+crcp), 80 | JenkinsHash(actName+"+"+prefix+"+parent-posTag:"+parent.posTag+"+child-posTag:"+child.posTag+"+clcp:"+clcp+"+prcp:"+prcp), 81 | JenkinsHash(actName+"+"+prefix+"+parent-posTag:"+parent.posTag+"+child-posTag:"+child.posTag+"+clcp:"+clcp+"+crcp:"+crcp), 82 | ) 83 | } 84 | 85 | func AddUnigramFeatures(features *[]int, state *State, actName string, idx int) { 86 | addUnigramFeatures(features, state, actName, idx-2, "p_i-2") 87 | addUnigramFeatures(features, state, actName, idx-1, "p_i-1") 88 | addUnigramFeatures(features, state, actName, idx, "p_i") 89 | addUnigramFeatures(features, state, actName, idx+1, "p_i+1") 90 | addUnigramFeatures(features, state, actName, idx+2, "p_i+2") 91 | addUnigramFeatures(features, state, actName, idx+3, "p_i+3") 92 | } 93 | 94 | func hasNoChildren(w *Word) bool { 95 | return len(w.children) == 0 96 | } 97 | 98 | func addStructuralSingleFeatures(features *[]int, state *State, actName string, idx int, prefix string) { 99 | if idx < 0 || idx >= len(state.pending) { 100 | return 101 | } 102 | w := state.pending[idx] 103 | *features = append(*features, 104 | JenkinsHash(actName+"+"+prefix+"+len:"+strconv.Itoa(len(w.children))), 105 | JenkinsHash(actName+"+"+prefix+"+no-children:"+strconv.FormatBool(hasNoChildren(w))), 106 | ) 107 | } 108 | 109 | func AddStructuralSingleFeatures(features *[]int, state *State, actName string, idx int) { 110 | addStructuralSingleFeatures(features, state, actName, idx-2, "p_i-2") 111 | addStructuralSingleFeatures(features, state, actName, idx-1, "p_i-1") 112 | addStructuralSingleFeatures(features, state, actName, idx, "p_i") 113 | addStructuralSingleFeatures(features, state, actName, idx+1, "p_i+1") 114 | addStructuralSingleFeatures(features, state, actName, idx+2, "p_i+2") 115 | addStructuralSingleFeatures(features, state, actName, idx+3, "p_i+3") 116 | } 117 | 118 | func addStructuralPairFeatures(features *[]int, actName string, left *Word, right *Word, prefix string) { 119 | if left == nil || right == nil { 120 | return 121 | } 122 | dist := int(math.Abs(float64(left.idx - right.idx))) 123 | 124 | *features = append(*features, 125 | JenkinsHash(actName+"+"+prefix+"+dist:"+distStr(dist)), 126 | JenkinsHash(actName+"+"+prefix+"+dist:"+distStr(dist)+"+leftPos:"+left.posTag+"+rightPos:"+right.posTag), 127 | ) 128 | } 129 | 130 | func extractFeatures(state *State, actName string, idx int) []int { 131 | features := make([]int, 0) 132 | AddUnigramFeatures(&features, state, actName, idx) 133 | AddStructuralSingleFeatures(&features, state, actName, idx) 134 | 135 | p0 := NilSafePendingWord(state, idx-1) 136 | p1 := NilSafePendingWord(state, idx) 137 | p2 := NilSafePendingWord(state, idx+1) 138 | p3 := NilSafePendingWord(state, idx+2) 139 | 140 | AddBigramFeatures(&features, actName, p1, p2, "p_i+p_{i+1}") 141 | AddBigramFeatures(&features, actName, p1, p3, "p_i+p_{i+2}") 142 | AddBigramFeatures(&features, actName, p0, p1, "p_{i-1}+p_i") 143 | AddBigramFeatures(&features, actName, p0, p3, "p_{i-1}+p_{i+2}") 144 | AddBigramFeatures(&features, actName, p2, p3, "p_{i+1}+p_{i+2}") 145 | 146 | addStructuralPairFeatures(&features, actName, p1, p2, "p_i+p_{i+1}") 147 | addStructuralPairFeatures(&features, actName, p1, p3, "p_i+p_{i+2}") 148 | addStructuralPairFeatures(&features, actName, p0, p1, "p_{i-1}+p_i") 149 | addStructuralPairFeatures(&features, actName, p0, p3, "p_{i-1}+p_{i+2}") 150 | addStructuralPairFeatures(&features, actName, p2, p3, "p_{i+1}+p_{i+2}") 151 | 152 | return features 153 | } 154 | 155 | func mod(n, m int) int { 156 | if n < 0 { 157 | return (m - (-n % m)) % m 158 | } else { 159 | return n % m 160 | } 161 | } 162 | 163 | var MaxFeatureLength = 1000000 164 | 165 | func JenkinsHash(s string) int { 166 | hash := 0 167 | for _, b := range []byte(s) { 168 | hash += int(b) 169 | hash += hash << 10 170 | hash ^= hash >> 6 171 | } 172 | 173 | hash += hash << 3 174 | hash ^= hash >> 11 175 | hash += hash << 15 176 | 177 | return mod(hash, MaxFeatureLength) 178 | } 179 | 180 | func ExtractFeatures(state *State, pair ActionIndexPair) []int { 181 | actName := runtime.FuncForPC(reflect.ValueOf(pair.action).Pointer()).Name() 182 | 183 | features := extractFeatures(state, actName, pair.index) 184 | return features 185 | } 186 | --------------------------------------------------------------------------------