├── .github ├── dependabot.yml └── workflows │ └── build.yaml ├── LICENSE ├── Makefile ├── README.md ├── dbconfig.yml ├── go.mod ├── go.sum ├── lib ├── add │ ├── add.go │ └── add_test.go ├── annotation │ ├── annotation.go │ ├── annotation_cli.go │ └── annotation_slack.go ├── classifier │ ├── mira.go │ └── mira_test.go ├── command │ └── command.go ├── diagnosis │ ├── diagnosis.go │ ├── feature_weight │ │ ├── feature_weight.go │ │ └── feature_weight_test.go │ └── label_conflict │ │ ├── label_conflict.go │ │ └── label_conflict_test.go ├── evaluation │ ├── evaluation.go │ └── evaluation_test.go ├── example │ └── example.go ├── feature │ ├── example │ │ ├── example.go │ │ └── example_test.go │ ├── feature.go │ └── tweet │ │ ├── tweet.go │ │ └── tweet_test.go ├── fetcher │ ├── fetcher.go │ └── fetcher_test.go ├── hatena_bookmark │ ├── hatena_bookmark.go │ └── hatena_bookmark_test.go ├── model │ ├── error.go │ ├── example.go │ ├── hatena_bookmark.go │ ├── label_type.go │ ├── recommendation.go │ ├── related_example.go │ └── tweet.go ├── related_example │ └── related_example.go ├── repository │ ├── example.go │ ├── example_test.go │ ├── hatena_bookmark.go │ ├── hatena_bookmark_test.go │ ├── mira.go │ ├── mira_test.go │ ├── recommendation.go │ ├── recommendation_test.go │ ├── related_example.go │ ├── related_example_test.go │ ├── repository.go │ ├── top_accessed_example.go │ ├── top_accessed_example_test.go │ ├── tweet.go │ └── tweet_test.go ├── service │ ├── example.go │ ├── example_test.go │ └── service.go ├── top_accessed_example │ └── top_accessed_example.go └── util │ ├── converter │ └── converter.go │ ├── file │ ├── file.go │ └── file_test.go │ ├── util.go │ └── util_test.go ├── main.go ├── migrations ├── 0.sql ├── 1.sql ├── 10.sql ├── 11.sql ├── 12.sql ├── 13.sql ├── 14.sql ├── 15.sql ├── 16.sql ├── 2.sql ├── 3.sql ├── 4.sql ├── 5.sql ├── 6.sql ├── 7.sql ├── 8.sql └── 9.sql ├── script └── create_database.sql └── tech_input_example.txt /.github/dependabot.yml: 
-------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: gomod 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "20:00" 8 | open-pull-requests-limit: 10 9 | reviewers: 10 | - syou6162 11 | -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: build and test 2 | on: [push] 3 | 4 | jobs: 5 | build: 6 | name: build and test 7 | runs-on: ubuntu-latest 8 | services: 9 | postgres: 10 | image: postgres:9.6 11 | env: 12 | POSTGRES_USER: nobody 13 | POSTGRES_PASSWORD: nobody 14 | POSTGRES_DB: go-active-learning-test 15 | ports: 16 | - 5432:5432 17 | options: >- 18 | --health-cmd pg_isready 19 | --health-interval 10s 20 | --health-timeout 5s 21 | --health-retries 5 22 | --name postgres 23 | steps: 24 | - name: checkout 25 | uses: actions/checkout@v2 26 | - name: format 27 | run: test `gofmt -l $(git ls-files | grep -e '\.go$' | grep -v -e vendor) | wc -l` = 0 28 | - name: deps 29 | run: make deps 30 | - name: build 31 | run: make build 32 | - name: test 33 | run: | 34 | export GOPATH=$HOME/go 35 | export GOBIN=$(go env GOPATH)/bin 36 | export PATH=$PATH:$GOPATH 37 | export PATH=$PATH:$GOBIN 38 | sql-migrate up -env=test 39 | make cover 40 | goveralls -coverprofile=${COVERAGE} -service=circle-ci -repotoken=${{ secrets.COVERALLS_TOKEN }} 41 | env: 42 | POSTGRES_HOST: localhost 43 | POSTGRES_PORT: 5432 44 | POSTGRES_USER: nobody 45 | POSTGRES_PASSWORD: nobody 46 | COVERAGE: coverage.out 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Yasuhisa Yoshida 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated 
documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | COVERAGE = coverage.out 2 | export GO111MODULE := on 3 | 4 | all: build 5 | 6 | .PHONY: deps 7 | deps: 8 | go mod download 9 | go get github.com/mattn/goveralls 10 | go get github.com/haya14busa/goverage 11 | go get github.com/rubenv/sql-migrate/sql-migrate 12 | 13 | .PHONY: build 14 | build: 15 | go build -v 16 | 17 | .PHONY: fmt 18 | fmt: 19 | gofmt -s -w $$(git ls-files | grep -e '\.go$$' | grep -v -e vendor) 20 | goimports -w $$(git ls-files | grep -e '\.go$$' | grep -v -e vendor) 21 | 22 | .PHONY: test 23 | test: 24 | DB_NAME=go-active-learning-test go test -v ./... -p 1 -count 1 25 | 26 | .PHONY: vet 27 | vet: 28 | go tool vet --all *.go 29 | 30 | .PHONY: test-all 31 | test-all: vet test 32 | 33 | .PHONY: cover 34 | cover: 35 | DB_NAME=go-active-learning-test goverage -parallel 1 -v -coverprofile=${COVERAGE} ./... 
36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-active-learning 2 | [![CircleCI](https://circleci.com/gh/syou6162/go-active-learning.svg?style=shield)](https://circleci.com/gh/syou6162/go-active-learning) 3 | [![Go Report Card](https://goreportcard.com/badge/github.com/syou6162/go-active-learning)](https://goreportcard.com/report/github.com/syou6162/go-active-learning) 4 | [![Coverage Status](https://coveralls.io/repos/github/syou6162/go-active-learning/badge.svg?branch=master)](https://coveralls.io/github/syou6162/go-active-learning?branch=master) 5 | 6 | go-active-learning is a command-line annotation tool for binary classification problems, written in Go. It uses a simple active learning algorithm to reduce annotation time. 7 | 8 | # Install 9 | 10 | ```console 11 | % go get github.com/syou6162/go-active-learning 12 | ``` 13 | 14 | ## Build from source 15 | 16 | ```console 17 | % git clone https://github.com/syou6162/go-active-learning.git 18 | % cd go-active-learning 19 | % createdb go-active-learning 20 | % createdb go-active-learning-test 21 | % sql-migrate up -env=local 22 | % sql-migrate up -env=test 23 | % make build 24 | ``` 25 | 26 | # Usage 27 | go-active-learning has an `annotate` mode (annotate new examples suggested by active learning) and a `diagnose` mode (check label conflicts in training data). To see the detailed options, type `./go-active-learning --help`. 28 | 29 | ## Annotation mode 30 | To see the detailed options, type `./go-active-learning annotate --help`. 31 | 32 | ## Annotate new examples from command line interface 33 | To see the detailed options, type `./go-active-learning annotate cli --help`. 34 | 35 | ```console 36 | % ./go-active-learning annotate cli --open-url 37 | Loading cache... 38 | Label this example (Score: 0.600): http://srdk.rakuten.jp/ (それどこ) 39 | 40 | p: Label this example as positive. 
41 | n: Label this example as negative. 42 | s: Skip this example. 43 | h: Show this help. 44 | e: Exit. 45 | 46 | Label this example (Score: 1.000): http://srdk.rakuten.jp/ (それどこ) 47 | Labeled as negative 48 | ``` 49 | 50 | ## Annotate new examples from slack 51 | To see the detailed options, type `./go-active-learning annotate slack --help`. To annotate new examples from Slack, you need to create a Slack bot and obtain a token from [here](https://my.slack.com/services/new/bot). You can pass the token via an environment variable (`SLACK_TOKEN`). 52 | 53 | ```console 54 | % export SLACK_TOKEN=xoxb-SLACK-TOKEN 55 | % ./go-active-learning annotate slack --filter-status-code-ok --channel CHANNEL_ID 56 | ``` 57 | 58 | ## Diagnosis mode 59 | To see the detailed options, type `./go-active-learning diagnose --help`. 60 | 61 | ### Diagnose training data 62 | This subcommand diagnoses label conflicts in training data. A 'conflict' means that the annotated label is -1 (or 1) but the label predicted by the model is 1 (or -1). In the example below, `http://www3.nhk.or.jp/news/` is a conflict case ('Label' is -1, but 'Score' is positive). You may need to collect more such news articles to train a good classifier. 63 | 64 | ```console 65 | % ./go-active-learning diagnose label-conflict 66 | Loading cache... 67 | Index Label Score URL Title 68 | 0 -1 0.491 http://www3.nhk.or.jp/news/ 69 | 1 1 0.491 http://blog.yuuk.io/ 70 | 2 1 0.491 http://www.yasuhisay.info/ 71 | 3 -1 -3.057 http://r.gnavi.co.jp/g-interview/ ぐるなび みんなのごはん 72 | 4 1 4.264 http://hakobe932.hatenablog.com/ hakobe-blog ♨ 73 | 5 -1 -7.151 http://suumo.jp/town/ SUUMOタウン 74 | 6 -1 -26.321 https://www.facebook.com/ ログイン (日本語) 75 | 7 1 44.642 http://www.songmu.jp/riji/ おそらくはそれさえも平凡な日々 76 | 8 1 121.170 http://motemen.hatenablog.com/ 詩と創作・思索のひろば 77 | Saving cache... 78 | ``` 79 | 80 | ### Diagnose feature weight 81 | This subcommand lists pairs of a feature weight and its name. 
82 | 83 | ```console 84 | % ./go-active-learning diagnose feature-weight --filter-status-code-ok | head -n 10 85 | +0.80 BODY:/ 86 | +0.80 BODY:ほか 87 | +0.80 BODY:郁 88 | +0.80 BODY:単行本 89 | +0.80 BODY:姿 90 | +0.80 BODY:暗黙 91 | +0.80 BODY:創造 92 | +0.80 BODY:企業 93 | +0.80 BODY:野中 94 | +0.80 BODY:準備 95 | ``` 96 | 97 | # Author 98 | Yasuhisa Yoshida 99 | -------------------------------------------------------------------------------- /dbconfig.yml: -------------------------------------------------------------------------------- 1 | test: 2 | dialect: postgres 3 | datasource: host=localhost user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} dbname=go-active-learning-test sslmode=disable 4 | dir: migrations 5 | 6 | local: 7 | dialect: postgres 8 | datasource: host=localhost user=nobody password=nobody dbname=go-active-learning sslmode=disable 9 | dir: migrations 10 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/syou6162/go-active-learning 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/PuerkitoBio/goquery v1.5.1 7 | github.com/araddon/dateparse v0.0.0-20190622164848-0fb0a474d195 // indirect 8 | github.com/fatih/set v0.2.1 // indirect 9 | github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 // indirect 10 | github.com/gorilla/websocket v1.4.0 // indirect 11 | github.com/ikawaha/kagome v1.11.2 12 | github.com/jaytaylor/html2text v0.0.0-20190408195923-01ec452cbe43 // indirect 13 | github.com/jdkato/prose v0.0.0-20181022194337-a179b97cfa6f 14 | github.com/jmoiron/sqlx v1.3.1 15 | github.com/lib/pq v1.10.0 16 | github.com/mackerelio/mackerel-client-go v0.16.0 17 | github.com/mattn/go-isatty v0.0.8 // indirect 18 | github.com/mattn/go-runewidth v0.0.4 // indirect 19 | github.com/mattn/go-tty v0.0.0-20190424173100-523744f04859 20 | github.com/montanaflynn/stats v0.5.0 // indirect 21 | github.com/neurosnap/sentences 
v1.0.6 // indirect 22 | github.com/nlopes/slack v0.6.0 23 | github.com/olekukonko/tablewriter v0.0.1 // indirect 24 | github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4 25 | github.com/pkg/errors v0.9.1 26 | github.com/shogo82148/go-shuffle v0.0.0-20180218125048-27e6095f230d // indirect 27 | github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect 28 | github.com/stretchr/testify v1.3.0 // indirect 29 | github.com/syou6162/GoOse v0.0.0-20190108170554-09969ebeb09f 30 | github.com/urfave/cli v1.22.5 31 | golang.org/x/sys v0.0.0-20190804053845-51ab0e2deafa // indirect 32 | golang.org/x/text v0.3.2 // indirect 33 | gopkg.in/neurosnap/sentences.v1 v1.0.6 // indirect 34 | gopkg.in/vmarkovtsev/go-lcss.v1 v1.0.0-20181020221121-dfc501d07ea0 35 | ) 36 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 2 | github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= 3 | github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= 4 | github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= 5 | github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= 6 | github.com/araddon/dateparse v0.0.0-20190622164848-0fb0a474d195 h1:c4mLfegoDw6OhSJXTd2jUEQgZUQuJWtocudb97Qn9EM= 7 | github.com/araddon/dateparse v0.0.0-20190622164848-0fb0a474d195/go.mod h1:SLqhdZcd+dF3TEVL2RMoob5bBP5R1P1qkox+HtCBgGI= 8 | github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY= 9 | github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= 10 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 11 | 
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 12 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 13 | github.com/fatih/set v0.2.1 h1:nn2CaJyknWE/6txyUDGwysr3G5QC6xWB/PtVjPBbeaA= 14 | github.com/fatih/set v0.2.1/go.mod h1:+RKtMCH+favT2+3YecHGxcc0b4KyVWA1QWWJUs4E0CI= 15 | github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 h1:u8AQ9bPa9oC+8/A/jlWouakhIvkFfuxgIIRjiy8av7I= 16 | github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573/go.mod h1:eBvb3i++NHDH4Ugo9qCvMw8t0mTSctaEa5blJbWcNxs= 17 | github.com/go-sql-driver/mysql v1.5.0 h1:ozyZYNQW3x3HtqT1jira07DN2PArx2v7/mN66gGcHOs= 18 | github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= 19 | github.com/gorilla/websocket v1.2.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= 20 | github.com/gorilla/websocket v1.4.0 h1:WDFjx/TMzVgy9VdMMQi2K2Emtwi2QcUQsztZ/zLaH/Q= 21 | github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= 22 | github.com/ikawaha/kagome v1.11.2 h1:eCWpLqv5Euqa5JcwkaobUSy6uGM8rwwMw5Su3eRepBI= 23 | github.com/ikawaha/kagome v1.11.2/go.mod h1:lHwhkGuuWqKWTxeQMppD0EmQAfKbc39QKx9qoWqgo+A= 24 | github.com/jaytaylor/html2text v0.0.0-20190408195923-01ec452cbe43 h1:jTkyeF7NZ5oIr0ESmcrpiDgAfoidCBF4F5kJhjtaRwE= 25 | github.com/jaytaylor/html2text v0.0.0-20190408195923-01ec452cbe43/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk= 26 | github.com/jdkato/prose v0.0.0-20181022194337-a179b97cfa6f h1:AQ+AwWeEFf6NsjaMzhuVKLfxZH1+i7aoHuYXObQAzDo= 27 | github.com/jdkato/prose v0.0.0-20181022194337-a179b97cfa6f/go.mod h1:jkF0lkxaX5PFSlk9l4Gh9Y+T57TqUZziWT7uZbW5ADg= 28 | github.com/jmoiron/sqlx v1.3.1 h1:aLN7YINNZ7cYOPK3QC83dbM6KT0NMqVMw961TqrejlE= 29 | github.com/jmoiron/sqlx v1.3.1/go.mod h1:2BljVx/86SuTyjE+aPYlHCTNvZrnJXghYGpNiXLBMCQ= 30 | github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= 31 | 
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= 32 | github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= 33 | github.com/lib/pq v1.10.0 h1:Zx5DJFEYQXio93kgXnQ09fXNiUKsqv4OUEu2UtGcB1E= 34 | github.com/lib/pq v1.10.0/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= 35 | github.com/mackerelio/mackerel-client-go v0.16.0 h1:9AoqOg+kX07QsVBGN8yD3Zx0skd+cGqESp7kXquDjDs= 36 | github.com/mackerelio/mackerel-client-go v0.16.0/go.mod h1:/GNOj+y1eFsd3CK8c6IQ/uS38/GT0+NWImk5YGJs5Lk= 37 | github.com/mattn/go-isatty v0.0.8 h1:HLtExJ+uU2HOZ+wI0Tt5DtUDrx8yhUqDcp7fYERX4CE= 38 | github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= 39 | github.com/mattn/go-runewidth v0.0.4 h1:2BvfKmzob6Bmd4YsL0zygOqfdFnK7GR4QL06Do4/p7Y= 40 | github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= 41 | github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg= 42 | github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= 43 | github.com/mattn/go-tty v0.0.0-20190424173100-523744f04859 h1:smQbSzmT3EHl4EUwtFwFGmGIpiYgIiiPeVv1uguIQEE= 44 | github.com/mattn/go-tty v0.0.0-20190424173100-523744f04859/go.mod h1:XPvLUNfbS4fJH25nqRHfWLMa1ONC8Amw+mIA639KxkE= 45 | github.com/montanaflynn/stats v0.5.0 h1:2EkzeTSqBB4V4bJwWrt5gIIrZmpJBcoIRGS2kWLgzmk= 46 | github.com/montanaflynn/stats v0.5.0/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= 47 | github.com/neurosnap/sentences v1.0.6 h1:iBVUivNtlwGkYsJblWV8GGVFmXzZzak907Ci8aA0VTE= 48 | github.com/neurosnap/sentences v1.0.6/go.mod h1:pg1IapvYpWCJJm/Etxeh0+gtMf1rI1STY9S7eUCPbDc= 49 | github.com/nlopes/slack v0.6.0 h1:jt0jxVQGhssx1Ib7naAOZEZcGdtIhTzkP0nopK0AsRA= 50 | github.com/nlopes/slack v0.6.0/go.mod h1:JzQ9m3PMAqcpeCam7UaHSuBuupz7CmpjehYMayT6YOk= 51 | github.com/olekukonko/tablewriter v0.0.1 h1:b3iUnf1v+ppJiOfNX4yxxqfWKMQPZR5yoh8urCTFX88= 52 | 
github.com/olekukonko/tablewriter v0.0.1/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo= 53 | github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4 h1:49lOXmGaUpV9Fz3gd7TFZY106KVlPVa5jcYD1gaQf98= 54 | github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4/go.mod h1:4OwLy04Bl9Ef3GJJCoec+30X3LQs/0/m4HFRt/2LUSA= 55 | github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 56 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 57 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 58 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 59 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 60 | github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q= 61 | github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 62 | github.com/shogo82148/go-shuffle v0.0.0-20180218125048-27e6095f230d h1:rUbV6LJa5RXK3jT/4jnJUz3UkrXzW6cqB+n9Fkbv9jY= 63 | github.com/shogo82148/go-shuffle v0.0.0-20180218125048-27e6095f230d/go.mod h1:2htx6lmL0NGLHlO8ZCf+lQBGBHIbEujyywxJArf+2Yc= 64 | github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo= 65 | github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= 66 | github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo= 67 | github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM= 68 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 69 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 70 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= 71 | github.com/stretchr/testify v1.3.0/go.mod 
h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 72 | github.com/syou6162/GoOse v0.0.0-20190108170554-09969ebeb09f h1:KZTZZaZYr4F+0V3AUEs2ZvOGYFlUKFdAWt+CkyhC2Wc= 73 | github.com/syou6162/GoOse v0.0.0-20190108170554-09969ebeb09f/go.mod h1:T2hVrnNfCW4aQcCS7ReyHEKMEZat4F+fxMCzBlf1Q8g= 74 | github.com/urfave/cli v1.22.5 h1:lNq9sAHXK2qfdI8W+GRItjCEkI+2oR4d+MEHy1CKXoU= 75 | github.com/urfave/cli v1.22.5/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= 76 | github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 77 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 78 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 79 | golang.org/x/lint v0.0.0-20200302205851-738671d3881b h1:Wh+f8QHJXR411sJR8/vRBTZ7YapZaRvUcLFFJhusH0k= 80 | golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= 81 | golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= 82 | golang.org/x/mod v0.2.0 h1:KU7oHjnv3XNWfa5COkzUifxZmxp1TyI7ImMXqFxLwvQ= 83 | golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 84 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 85 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 86 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 87 | golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 88 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b h1:0mm1VjtFUOIlE1SbDlwjYaDxZVDP2S5ou6y0gSgXHu8= 89 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 90 | golang.org/x/sync 
v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 91 | golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 92 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 93 | golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 94 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 95 | golang.org/x/sys v0.0.0-20190804053845-51ab0e2deafa h1:KIDDMLT1O0Nr7TSxp8xM5tJcdn8tgyAONntO829og1M= 96 | golang.org/x/sys v0.0.0-20190804053845-51ab0e2deafa/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 97 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 98 | golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= 99 | golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= 100 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 101 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 102 | golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= 103 | golang.org/x/tools v0.0.0-20200612022331-742c5eb664c2 h1:DVqHa33CzfnTKwUV6be+I4hp31W6iXn3ZiEcdKGzLyI= 104 | golang.org/x/tools v0.0.0-20200612022331-742c5eb664c2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= 105 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 106 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 107 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= 108 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod 
h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 109 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 110 | gopkg.in/neurosnap/sentences.v1 v1.0.6 h1:v7ElyP020iEZQONyLld3fHILHWOPs+ntzuQTNPkul8E= 111 | gopkg.in/neurosnap/sentences.v1 v1.0.6/go.mod h1:YlK+SN+fLQZj+kY3r8DkGDhDr91+S3JmTb5LSxFRQo0= 112 | gopkg.in/vmarkovtsev/go-lcss.v1 v1.0.0-20181020221121-dfc501d07ea0 h1:YY+ZVPsg2oJnV1rpzwIWtuCtQk71YFwuk47mMtjraN4= 113 | gopkg.in/vmarkovtsev/go-lcss.v1 v1.0.0-20181020221121-dfc501d07ea0/go.mod h1:6LhSPGi1OSJsWUQZridpjQXWEnDzw7EZAXSjc5SyF8A= 114 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 115 | -------------------------------------------------------------------------------- /lib/add/add.go: -------------------------------------------------------------------------------- 1 | package add 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "time" 7 | 8 | "os" 9 | 10 | mkr "github.com/mackerelio/mackerel-client-go" 11 | "github.com/syou6162/go-active-learning/lib/classifier" 12 | "github.com/syou6162/go-active-learning/lib/hatena_bookmark" 13 | "github.com/syou6162/go-active-learning/lib/service" 14 | "github.com/syou6162/go-active-learning/lib/util" 15 | "github.com/syou6162/go-active-learning/lib/util/file" 16 | "github.com/urfave/cli" 17 | ) 18 | 19 | func doAdd(c *cli.Context) error { 20 | inputFilename := c.String("input-filename") 21 | 22 | if inputFilename == "" { 23 | _ = cli.ShowCommandHelp(c, "add") 24 | return cli.NewExitError("`input-filename` is a required field.", 1) 25 | } 26 | 27 | app, err := service.NewDefaultApp() 28 | if err != nil { 29 | return err 30 | } 31 | defer app.Close() 32 | 33 | examples, err := file.ReadExamples(inputFilename) 34 | if err != nil { 35 | return err 36 | } 37 | 38 | if err := app.AttachMetadata(examples, 0, 0); err != nil { 39 | return err 40 | } 41 | 42 | examples = util.FilterStatusCodeNotOkExamples(examples) 43 | app.Fetch(examples) 44 | 
examples = util.FilterStatusCodeOkExamples(examples) 45 | 46 | m, err := app.FindLatestMIRAModel(classifier.EXAMPLE) 47 | skipPredictScore := false 48 | if err != nil { 49 | log.Println(fmt.Sprintf("Error to load model %s", err.Error())) 50 | skipPredictScore = true 51 | } 52 | 53 | for _, e := range examples { 54 | if !skipPredictScore { 55 | e.Score = m.PredictScore(e.Fv) 56 | } 57 | if e.CreatedAt.Before(time.Date(2000, 01, 01, 0, 0, 0, 0, time.Local)) { 58 | log.Println(fmt.Sprintf("Skipin too old example: %s", e.Url)) 59 | continue 60 | } 61 | if err = app.UpdateOrCreateExample(e); err != nil { 62 | log.Println(fmt.Sprintf("Error occured proccessing %s %s", e.Url, err.Error())) 63 | continue 64 | } 65 | if err = app.UpdateFeatureVector(e); err != nil { 66 | log.Println(fmt.Sprintf("Error occured proccessing %s feature vector %s", e.Url, err.Error())) 67 | continue 68 | } 69 | if bookmark, err := hatena_bookmark.GetHatenaBookmark(e.FinalUrl); err == nil { 70 | e.HatenaBookmark = bookmark 71 | app.UpdateHatenaBookmark(e) 72 | } 73 | } 74 | 75 | if err := postNumOfExamplesToMackerel(app); err != nil { 76 | return err 77 | } 78 | 79 | return nil 80 | } 81 | 82 | func postNumOfExamplesToMackerel(app service.GoActiveLearningApp) error { 83 | cnt, err := app.CountPositiveExamples() 84 | if err != nil { 85 | return err 86 | } 87 | if err := postNumOfExamplesByLabelToMackerel("count.positive", cnt); err != nil { 88 | return err 89 | } 90 | 91 | cnt, err = app.CountNegativeExamples() 92 | if err != nil { 93 | return err 94 | } 95 | if err := postNumOfExamplesByLabelToMackerel("count.negative", cnt); err != nil { 96 | return err 97 | } 98 | 99 | cnt, err = app.CountUnlabeledExamples() 100 | if err != nil { 101 | return err 102 | } 103 | if err := postNumOfExamplesByLabelToMackerel("count.unlabeled", cnt); err != nil { 104 | return err 105 | } 106 | return nil 107 | } 108 | 109 | func postNumOfExamplesByLabelToMackerel(label string, cnt int) error { 110 | apiKey := 
// ActionType identifies what the annotator asked for with a key press.
type ActionType int

// The numeric values follow declaration order (iota), starting at 0.
const (
	LABEL_AS_POSITIVE ActionType = iota
	LABEL_AS_NEGATIVE
	HELP
	SKIP
	EXIT
)

// rune2ActionType translates a pressed key into its ActionType.
//
// Recognized keys are 'p' (positive), 'n' (negative), 's' (skip), 'h' (help)
// and 'e' (exit). Every other rune yields HELP.
func rune2ActionType(r rune) ActionType {
	actionByKey := map[rune]ActionType{
		'p': LABEL_AS_POSITIVE,
		'n': LABEL_AS_NEGATIVE,
		's': SKIP,
		'h': HELP,
		'e': EXIT,
	}
	if action, known := actionByKey[r]; known {
		return action
	}
	// Unknown keys fall back to showing the help text.
	return HELP
}
81 | `, 82 | Action: doAnnotateWithSlack, 83 | Flags: []cli.Flag{ 84 | cli.StringFlag{Name: "channel"}, 85 | cli.BoolFlag{Name: "filter-status-code-ok", Usage: "Use only examples with status code = 200"}, 86 | }, 87 | }, 88 | }, 89 | } 90 | -------------------------------------------------------------------------------- /lib/annotation/annotation_cli.go: -------------------------------------------------------------------------------- 1 | package annotation 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "math" 8 | "sort" 9 | 10 | "github.com/mattn/go-tty" 11 | "github.com/pkg/browser" 12 | "github.com/syou6162/go-active-learning/lib/classifier" 13 | "github.com/syou6162/go-active-learning/lib/example" 14 | "github.com/syou6162/go-active-learning/lib/model" 15 | "github.com/syou6162/go-active-learning/lib/service" 16 | "github.com/syou6162/go-active-learning/lib/util" 17 | "github.com/syou6162/go-active-learning/lib/util/converter" 18 | "github.com/urfave/cli" 19 | ) 20 | 21 | func input2ActionType() (ActionType, error) { 22 | t, err := tty.Open() 23 | defer t.Close() 24 | if err != nil { 25 | return EXIT, err 26 | } 27 | var r rune 28 | for r == 0 { 29 | r, err = t.ReadRune() 30 | if err != nil { 31 | return HELP, err 32 | } 33 | } 34 | return rune2ActionType(r), nil 35 | } 36 | 37 | func doAnnotate(c *cli.Context) error { 38 | openUrl := c.Bool("open-url") 39 | filterStatusCodeOk := c.Bool("filter-status-code-ok") 40 | showActiveFeatures := c.Bool("show-active-features") 41 | 42 | app, err := service.NewDefaultApp() 43 | if err != nil { 44 | return err 45 | } 46 | defer app.Close() 47 | 48 | examples, err := app.SearchExamples() 49 | if err != nil { 50 | return err 51 | } 52 | 53 | stat := example.GetStat(examples) 54 | fmt.Fprintln(os.Stderr, fmt.Sprintf("Positive:%d, Negative:%d, Unlabeled:%d", stat["positive"], stat["negative"], stat["unlabeled"])) 55 | 56 | app.Fetch(examples) 57 | for _, e := range examples { 58 | app.UpdateFeatureVector(e) 59 | } 60 | if 
filterStatusCodeOk { 61 | examples = util.FilterStatusCodeOkExamples(examples) 62 | } 63 | 64 | m, err := classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples)) 65 | if err != nil { 66 | return err 67 | } 68 | 69 | annotationLoop: 70 | for { 71 | e := NextExampleToBeAnnotated(*m, examples) 72 | if e == nil { 73 | fmt.Println("No example") 74 | break annotationLoop 75 | } 76 | fmt.Println("Label this example (Score: " + fmt.Sprintf("%+0.03f", e.Score) + "): " + e.Url + " (" + e.Title + ")") 77 | 78 | if openUrl { 79 | browser.OpenURL(e.Url) 80 | } 81 | if showActiveFeatures { 82 | ShowActiveFeatures(*m, *e, 5) 83 | } 84 | 85 | act, err := input2ActionType() 86 | if err != nil { 87 | return err 88 | } 89 | switch act { 90 | case LABEL_AS_POSITIVE: 91 | fmt.Println("Labeled as positive") 92 | e.Annotate(model.POSITIVE) 93 | app.UpdateOrCreateExample(e) 94 | case LABEL_AS_NEGATIVE: 95 | fmt.Println("Labeled as negative") 96 | e.Annotate(model.NEGATIVE) 97 | app.UpdateOrCreateExample(e) 98 | case SKIP: 99 | fmt.Println("Skiped this example") 100 | examples = util.RemoveExample(examples, *e) 101 | continue 102 | case HELP: 103 | fmt.Println(ActionHelpDoc) 104 | case EXIT: 105 | fmt.Println("EXIT") 106 | break annotationLoop 107 | default: 108 | break annotationLoop 109 | } 110 | 111 | m, err = classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples)) 112 | if err != nil { 113 | return err 114 | } 115 | } 116 | 117 | return nil 118 | } 119 | 120 | type FeatureWeightPair struct { 121 | Feature string 122 | Weight float64 123 | } 124 | 125 | type FeatureWeightPairs []FeatureWeightPair 126 | 127 | func SortedActiveFeatures(model classifier.MIRAClassifier, example model.Example, n int) FeatureWeightPairs { 128 | pairs := FeatureWeightPairs{} 129 | for _, f := range example.Fv { 130 | pairs = append(pairs, FeatureWeightPair{f, model.GetWeight(f)}) 
131 | } 132 | sort.Sort(sort.Reverse(pairs)) 133 | 134 | result := FeatureWeightPairs{} 135 | cnt := 0 136 | for _, pair := range pairs { 137 | if cnt >= n { 138 | break 139 | } 140 | if (example.Score > 0.0 && pair.Weight > 0.0) || (example.Score < 0.0 && pair.Weight < 0.0) { 141 | result = append(result, pair) 142 | cnt++ 143 | } 144 | } 145 | return result 146 | } 147 | 148 | func ShowActiveFeatures(model classifier.MIRAClassifier, example model.Example, n int) { 149 | for _, pair := range SortedActiveFeatures(model, example, n) { 150 | fmt.Println(fmt.Sprintf("%+0.1f %s", pair.Weight, pair.Feature)) 151 | } 152 | } 153 | 154 | func (slice FeatureWeightPairs) Len() int { 155 | return len(slice) 156 | } 157 | 158 | func (slice FeatureWeightPairs) Less(i, j int) bool { 159 | return math.Abs(slice[i].Weight) < math.Abs(slice[j].Weight) 160 | } 161 | 162 | func (slice FeatureWeightPairs) Swap(i, j int) { 163 | slice[i], slice[j] = slice[j], slice[i] 164 | } 165 | -------------------------------------------------------------------------------- /lib/annotation/annotation_slack.go: -------------------------------------------------------------------------------- 1 | package annotation 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/nlopes/slack" 8 | "github.com/pkg/errors" 9 | "github.com/syou6162/go-active-learning/lib/classifier" 10 | "github.com/syou6162/go-active-learning/lib/example" 11 | "github.com/syou6162/go-active-learning/lib/model" 12 | "github.com/syou6162/go-active-learning/lib/service" 13 | "github.com/syou6162/go-active-learning/lib/util" 14 | "github.com/syou6162/go-active-learning/lib/util/converter" 15 | "github.com/urfave/cli" 16 | ) 17 | 18 | func doAnnotateWithSlack(c *cli.Context) error { 19 | channelID := c.String("channel") 20 | filterStatusCodeOk := c.Bool("filter-status-code-ok") 21 | 22 | if channelID == "" { 23 | _ = cli.ShowCommandHelp(c, "slack") 24 | return cli.NewExitError("`channel` is a required field.", 1) 25 | } 26 | 27 | api 
:= slack.New(os.Getenv("SLACK_TOKEN")) 28 | rtm := api.NewRTM() 29 | go rtm.ManageConnection() 30 | 31 | app, err := service.NewDefaultApp() 32 | if err != nil { 33 | return err 34 | } 35 | defer app.Close() 36 | 37 | examples, err := app.SearchExamples() 38 | if err != nil { 39 | return err 40 | } 41 | 42 | stat := example.GetStat(examples) 43 | msg := rtm.NewOutgoingMessage(fmt.Sprintf("Positive:%d, Negative:%d, Unlabeled:%d", stat["positive"], stat["negative"], stat["unlabeled"]), channelID) 44 | rtm.SendMessage(msg) 45 | 46 | app.Fetch(examples) 47 | for _, e := range examples { 48 | app.UpdateFeatureVector(e) 49 | } 50 | if filterStatusCodeOk { 51 | examples = util.FilterStatusCodeOkExamples(examples) 52 | } 53 | 54 | m, err := classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples)) 55 | if err != nil { 56 | return err 57 | } 58 | e := NextExampleToBeAnnotated(*m, examples) 59 | if e == nil { 60 | return errors.New("No e to annotate") 61 | } 62 | 63 | rtm.SendMessage(rtm.NewOutgoingMessage("Ready to annotate!", channelID)) 64 | showExample(rtm, *m, e, channelID) 65 | prevTimestamp := "" 66 | 67 | annotationLoop: 68 | for { 69 | select { 70 | case msg := <-rtm.IncomingEvents: 71 | switch ev := msg.Data.(type) { 72 | case *slack.AckMessage: 73 | prevTimestamp = ev.Timestamp 74 | case *slack.MessageEvent: 75 | if ev.Channel != channelID { 76 | break 77 | } 78 | text := ev.Text 79 | if len(text) > 1 || len(text) == 0 { 80 | break 81 | } 82 | r := []rune(text)[0] 83 | act := rune2ActionType(r) 84 | 85 | switch act { 86 | case LABEL_AS_POSITIVE: 87 | e.Annotate(model.POSITIVE) 88 | m, err = classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples)) 89 | if err != nil { 90 | return err 91 | } 92 | rtm.AddReaction("heavy_plus_sign", slack.NewRefToMessage(channelID, prevTimestamp)) 93 | case LABEL_AS_NEGATIVE: 94 | e.Annotate(model.NEGATIVE) 
95 | m, err = classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples)) 96 | if err != nil { 97 | return err 98 | } 99 | rtm.AddReaction("heavy_minus_sign", slack.NewRefToMessage(channelID, prevTimestamp)) 100 | case SKIP: 101 | rtm.SendMessage(rtm.NewOutgoingMessage("Skiped this e", channelID)) 102 | examples = util.RemoveExample(examples, *e) 103 | break 104 | case HELP: 105 | rtm.SendMessage(rtm.NewOutgoingMessage(ActionHelpDoc, channelID)) 106 | case EXIT: 107 | rtm.SendMessage(rtm.NewOutgoingMessage("EXIT", channelID)) 108 | break annotationLoop 109 | default: 110 | break annotationLoop 111 | } 112 | e = NextExampleToBeAnnotated(*m, examples) 113 | if e == nil { 114 | return errors.New("No e to annotate") 115 | } 116 | showExample(rtm, *m, e, channelID) 117 | case *slack.InvalidAuthEvent: 118 | return errors.New("Invalid credentials") 119 | default: 120 | } 121 | } 122 | } 123 | return nil 124 | } 125 | 126 | func showExample(rtm *slack.RTM, model classifier.MIRAClassifier, example *model.Example, channelID string) { 127 | activeFeaturesStr := "Active Features: " 128 | for _, pair := range SortedActiveFeatures(model, *example, 5) { 129 | activeFeaturesStr += fmt.Sprintf("%s(%+0.1f) ", pair.Feature, pair.Weight) 130 | } 131 | rtm.SendMessage(rtm.NewOutgoingMessage(fmt.Sprintf("%s\nScore: %+0.2f\n%s", example.Url, example.Score, activeFeaturesStr), channelID)) 132 | } 133 | -------------------------------------------------------------------------------- /lib/classifier/mira.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "math/rand" 7 | "os" 8 | "runtime" 9 | "sort" 10 | "sync" 11 | 12 | "github.com/pkg/errors" 13 | "github.com/syou6162/go-active-learning/lib/evaluation" 14 | "github.com/syou6162/go-active-learning/lib/feature" 15 | "github.com/syou6162/go-active-learning/lib/model" 16 | 
"github.com/syou6162/go-active-learning/lib/util" 17 | ) 18 | 19 | type ModelType int 20 | 21 | const ( 22 | EXAMPLE ModelType = 0 23 | TWITTER ModelType = 1 24 | ) 25 | 26 | type MIRAClassifier struct { 27 | ModelType ModelType `json:"ModelType"` 28 | Weight map[string]float64 `json:"Weight"` 29 | C float64 `json:"C"` 30 | Accuracy float64 `json:"Accuracy"` 31 | Precision float64 `json:"Precision"` 32 | Recall float64 `json:"Recall"` 33 | Fvalue float64 `json:"Fvalue"` 34 | } 35 | 36 | type LearningInstance interface { 37 | GetFeatureVector() feature.FeatureVector 38 | GetLabel() model.LabelType 39 | } 40 | 41 | type LearningInstances []LearningInstance 42 | 43 | var errNoTrainingInstances = errors.New("Empty training set") 44 | var errNoDevelopmentInstances = errors.New("Empty development set") 45 | var errNoMIRAModelLearned = errors.New("Fail to learn MIRA models") 46 | var errModelEvaluationFailure = errors.New("Failed to evaluate best MIRA") 47 | var errTrainingInstancesAllPositive = errors.New("Labels of training instances are all positive") 48 | var errTrainingInstancesAllNegative = errors.New("Labels of training instances are all negative") 49 | var errDevelopmentInstancesAllPositive = errors.New("Labels of development instances are all positive") 50 | var errDevelopmentInstancesAllNegative = errors.New("Labels of development instances are all negative") 51 | 52 | func newMIRAClassifier(modelType ModelType, c float64) *MIRAClassifier { 53 | return &MIRAClassifier{ 54 | ModelType: modelType, 55 | Weight: make(map[string]float64), 56 | C: c, 57 | Accuracy: 0.0, 58 | Precision: 0.0, 59 | Recall: 0.0, 60 | Fvalue: 0.0, 61 | } 62 | } 63 | 64 | func filterLabeledInstances(instances LearningInstances) LearningInstances { 65 | var result LearningInstances 66 | for _, i := range instances { 67 | if i.GetLabel() != 0 { 68 | result = append(result, i) 69 | } 70 | } 71 | return result 72 | } 73 | 74 | func shuffle(instances LearningInstances) { 75 | n := len(instances) 
76 | for i := n - 1; i >= 0; i-- { 77 | j := rand.Intn(i + 1) 78 | instances[i], instances[j] = instances[j], instances[i] 79 | } 80 | } 81 | 82 | func splitTrainAndDev(instances LearningInstances) (train LearningInstances, dev LearningInstances) { 83 | shuffle(instances) 84 | n := int(0.8 * float64(len(instances))) 85 | return instances[0:n], instances[n:] 86 | } 87 | 88 | func NewMIRAClassifier(modelType ModelType, instances LearningInstances, c float64) *MIRAClassifier { 89 | train := filterLabeledInstances(instances) 90 | model := newMIRAClassifier(modelType, c) 91 | for iter := 0; iter < 30; iter++ { 92 | shuffle(train) 93 | for _, example := range train { 94 | model.learn(example) 95 | } 96 | } 97 | return model 98 | } 99 | 100 | func overSamplingPositiveExamples(instances LearningInstances) LearningInstances { 101 | overSampled := LearningInstances{} 102 | posInstances := LearningInstances{} 103 | negInstances := LearningInstances{} 104 | 105 | numNeg := 0 106 | 107 | for _, i := range instances { 108 | if i.GetLabel() == model.NEGATIVE { 109 | numNeg += 1 110 | negInstances = append(negInstances, i) 111 | } else if i.GetLabel() == model.POSITIVE { 112 | posInstances = append(posInstances, i) 113 | } 114 | } 115 | 116 | for len(overSampled) <= numNeg { 117 | shuffle(posInstances) 118 | overSampled = append(overSampled, posInstances[0]) 119 | } 120 | overSampled = append(overSampled, negInstances...) 
121 | shuffle(overSampled) 122 | 123 | return overSampled 124 | } 125 | 126 | func extractGoldLabels(instances LearningInstances) []model.LabelType { 127 | golds := make([]model.LabelType, 0, 0) 128 | for _, i := range instances { 129 | golds = append(golds, i.GetLabel()) 130 | } 131 | return golds 132 | } 133 | 134 | type MIRAClassifierList []MIRAClassifier 135 | 136 | func (l MIRAClassifierList) Len() int { return len(l) } 137 | func (l MIRAClassifierList) Less(i, j int) bool { return l[i].Fvalue < l[j].Fvalue } 138 | func (l MIRAClassifierList) Swap(i, j int) { l[i], l[j] = l[j], l[i] } 139 | 140 | func allSameLabel(instances LearningInstances, label model.LabelType) bool { 141 | for _, instance := range instances { 142 | if instance.GetLabel() != label { 143 | return false 144 | } 145 | } 146 | return true 147 | } 148 | 149 | func isValidTrainAndDevelopmentInstances(train LearningInstances, dev LearningInstances) (bool, error) { 150 | if len(train) == 0 { 151 | return false, errNoTrainingInstances 152 | } 153 | if len(dev) == 0 { 154 | return false, errNoDevelopmentInstances 155 | } 156 | 157 | if allSameLabel(train, model.POSITIVE) { 158 | return false, errTrainingInstancesAllPositive 159 | } 160 | if allSameLabel(train, model.NEGATIVE) { 161 | return false, errTrainingInstancesAllNegative 162 | } 163 | if allSameLabel(dev, model.POSITIVE) { 164 | return false, errDevelopmentInstancesAllPositive 165 | } 166 | if allSameLabel(dev, model.NEGATIVE) { 167 | return false, errDevelopmentInstancesAllNegative 168 | } 169 | 170 | return true, nil 171 | } 172 | 173 | func NewMIRAClassifierByCrossValidation(modelType ModelType, instances LearningInstances) (*MIRAClassifier, error) { 174 | shuffle(instances) 175 | train, dev := splitTrainAndDev(filterLabeledInstances(instances)) 176 | if valid, err := isValidTrainAndDevelopmentInstances(train, dev); !valid { 177 | return nil, err 178 | } 179 | 180 | train = overSamplingPositiveExamples(train) 181 | 182 | params := 
[]float64{1000, 500, 100, 50, 10.0, 5.0, 1.0, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001} 183 | miraResults := MIRAClassifierList{} 184 | 185 | wg := &sync.WaitGroup{} 186 | cpus := runtime.NumCPU() 187 | runtime.GOMAXPROCS(cpus) 188 | 189 | models := make([]*MIRAClassifier, len(params)) 190 | for idx, c := range params { 191 | wg.Add(1) 192 | go func(idx int, c float64) { 193 | defer wg.Done() 194 | model := NewMIRAClassifier(modelType, train, c) 195 | models[idx] = model 196 | }(idx, c) 197 | } 198 | wg.Wait() 199 | 200 | if len(models) == 0 { 201 | return nil, errNoMIRAModelLearned 202 | } 203 | 204 | maxFvalue := math.Inf(-1) 205 | for _, m := range models { 206 | devPredicts := make([]model.LabelType, len(dev)) 207 | for i, instance := range dev { 208 | devPredicts[i] = m.Predict(instance.GetFeatureVector()) 209 | } 210 | m.Accuracy = evaluation.GetAccuracy(extractGoldLabels(dev), devPredicts) 211 | m.Precision = evaluation.GetPrecision(extractGoldLabels(dev), devPredicts) 212 | m.Recall = evaluation.GetRecall(extractGoldLabels(dev), devPredicts) 213 | m.Fvalue = (2 * m.Recall * m.Precision) / (m.Recall + m.Precision) 214 | fmt.Fprintln(os.Stderr, fmt.Sprintf("C:%0.03f\tAccuracy:%0.03f\tPrecision:%0.03f\tRecall:%0.03f\tF-value:%0.03f", m.C, m.Accuracy, m.Precision, m.Recall, m.Fvalue)) 215 | tp, fp, fn, tn := evaluation.GetConfusionMatrix(extractGoldLabels(dev), devPredicts) 216 | fmt.Fprintln(os.Stderr, fmt.Sprintf("tp:%d\tfp:%d\tfn:%d\ttn:%d", tp, fp, fn, tn)) 217 | if math.IsNaN(m.Fvalue) { 218 | continue 219 | } 220 | miraResults = append(miraResults, *m) 221 | if m.Fvalue >= maxFvalue { 222 | maxFvalue = m.Fvalue 223 | } 224 | } 225 | if len(miraResults) == 0 { 226 | return nil, errModelEvaluationFailure 227 | } 228 | 229 | sort.Sort(sort.Reverse(miraResults)) 230 | bestModel := &miraResults[0] 231 | instances = overSamplingPositiveExamples(instances) 232 | shuffle(instances) 233 | result := NewMIRAClassifier(modelType, filterLabeledInstances(instances), 
bestModel.C) 234 | result.Accuracy = bestModel.Accuracy 235 | result.Precision = bestModel.Precision 236 | result.Recall = bestModel.Recall 237 | result.Fvalue = bestModel.Fvalue 238 | return result, nil 239 | } 240 | 241 | func (m *MIRAClassifier) learn(instance LearningInstance) { 242 | tmp := float64(instance.GetLabel()) * m.PredictScore(instance.GetFeatureVector()) // y w^T x 243 | loss := 0.0 244 | if tmp < 1.0 { 245 | loss = 1 - tmp 246 | } 247 | 248 | norm := float64(len(instance.GetFeatureVector()) * len(instance.GetFeatureVector())) 249 | // tau := math.Min(m.C, loss/norm) // update by PA-I 250 | tau := loss / (norm + 1.0/m.C) // update by PA-II 251 | 252 | if tau != 0.0 { 253 | for _, f := range instance.GetFeatureVector() { 254 | w, _ := m.Weight[f] 255 | m.Weight[f] = w + tau*float64(instance.GetLabel()) 256 | } 257 | } 258 | } 259 | 260 | func (m MIRAClassifier) PredictScore(features feature.FeatureVector) float64 { 261 | result := 0.0 262 | for _, f := range features { 263 | w, ok := m.Weight[f] 264 | if ok { 265 | result = result + w*1.0 266 | } 267 | } 268 | return result 269 | } 270 | 271 | func (m MIRAClassifier) Predict(features feature.FeatureVector) model.LabelType { 272 | if m.PredictScore(features) > 0 { 273 | return model.POSITIVE 274 | } 275 | return model.NEGATIVE 276 | } 277 | 278 | func (m MIRAClassifier) SortByScore(examples model.Examples) model.Examples { 279 | var unlabeledExamples model.Examples 280 | for _, e := range util.FilterUnlabeledExamples(examples) { 281 | e.Score = m.PredictScore(e.Fv) 282 | if !e.IsLabeled() && e.Score != 0.0 { 283 | unlabeledExamples = append(unlabeledExamples, e) 284 | } 285 | } 286 | 287 | sort.Sort(unlabeledExamples) 288 | return unlabeledExamples 289 | } 290 | 291 | func (m MIRAClassifier) GetWeight(f string) float64 { 292 | w, ok := m.Weight[f] 293 | if ok { 294 | return w 295 | } 296 | return 0.0 297 | } 298 | 299 | func (m MIRAClassifier) GetActiveFeatures() []string { 300 | result := 
make([]string, 0) 301 | for f := range m.Weight { 302 | result = append(result, f) 303 | } 304 | return result 305 | } 306 | -------------------------------------------------------------------------------- /lib/classifier/mira_test.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/syou6162/go-active-learning/lib/example" 7 | "github.com/syou6162/go-active-learning/lib/model" 8 | ) 9 | 10 | func TestPredictScore(t *testing.T) { 11 | e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE) 12 | e1.Title = "bookmark" 13 | e1.Fv = []string{"hoge", "fuga"} 14 | e2 := example.NewExample("http://google.com", model.NEGATIVE) 15 | e2.Title = "google" 16 | e2.Fv = []string{"piyo", "aaa"} 17 | e3 := example.NewExample("https://www.hatena.ne.jp", model.POSITIVE) 18 | e3.Title = "hatena" 19 | e3.Fv = []string{"hoge", "fuga"} 20 | e4 := example.NewExample("http://hogehoge.com", model.UNLABELED) 21 | e4.Title = "hogehoge" 22 | e4.Fv = []string{"piyo", "hoge"} 23 | 24 | examples := LearningInstances{e1, e2, e3, e4} 25 | c := NewMIRAClassifier(EXAMPLE, examples, 1.0) 26 | 27 | if c.PredictScore(e4.Fv) < 0.0 { 28 | t.Errorf("c.PredictScore(e4.Fv) == %f, want >= 0", c.PredictScore(e4.Fv)) 29 | } 30 | } 31 | 32 | func TestSplitTrainAndDev(t *testing.T) { 33 | e1 := example.NewExample("http://a.hatena.ne.jp", model.POSITIVE) 34 | e2 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE) 35 | e3 := example.NewExample("http://google.com", model.UNLABELED) 36 | e4 := example.NewExample("https://a.hatena.ne.jp", model.POSITIVE) 37 | e5 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE) 38 | e6 := example.NewExample("https://a.hatena.ne.jp", model.POSITIVE) 39 | e7 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE) 40 | e8 := example.NewExample("http://google.com", model.UNLABELED) 41 | e9 := 
example.NewExample("https://a.hatena.ne.jp", model.POSITIVE) 42 | e10 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE) 43 | 44 | train, dev := splitTrainAndDev(LearningInstances{e1, e2, e3, e4, e5, e6, e7, e8, e9, e10}) 45 | if len(train) != 8 { 46 | t.Error("Number of training examples should be 8") 47 | } 48 | if len(dev) != 2 { 49 | t.Error("Number of dev examples should be 2") 50 | } 51 | } 52 | 53 | func TestGetWeight(t *testing.T) { 54 | e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE) 55 | e1.Title = "bookmark" 56 | e1.Fv = []string{"hoge", "fuga"} 57 | e2 := example.NewExample("http://google.com", model.NEGATIVE) 58 | e2.Title = "google" 59 | e2.Fv = []string{"piyo", "aaa"} 60 | e3 := example.NewExample("https://www.hatena.ne.jp", model.POSITIVE) 61 | e3.Title = "hatena" 62 | e3.Fv = []string{"hoge", "fuga"} 63 | e4 := example.NewExample("http://hogehoge.com", model.UNLABELED) 64 | e4.Title = "hogehoge" 65 | e4.Fv = []string{"piyo", "hoge"} 66 | 67 | examples := LearningInstances{e1, e2, e3, e4} 68 | c := NewMIRAClassifier(EXAMPLE, examples, 1.0) 69 | 70 | if c.GetWeight("hoge") <= 0.0 { 71 | t.Errorf("c.GetWeight('hoge') == %f, want > 0", c.GetWeight("hoge")) 72 | } 73 | } 74 | 75 | func TestGetActiveFeatures(t *testing.T) { 76 | e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE) 77 | e1.Title = "bookmark" 78 | e1.Fv = []string{"hoge", "fuga"} 79 | e2 := example.NewExample("http://google.com", model.NEGATIVE) 80 | e2.Title = "google" 81 | e2.Fv = []string{"piyo", "aaa"} 82 | e3 := example.NewExample("https://www.hatena.ne.jp", model.POSITIVE) 83 | e3.Title = "hatena" 84 | e3.Fv = []string{"hoge", "fuga"} 85 | e4 := example.NewExample("http://hogehoge.com", model.UNLABELED) 86 | e4.Title = "hogehoge" 87 | e4.Fv = []string{"piyo", "hoge"} 88 | 89 | examples := LearningInstances{e1, e2, e3, e4} 90 | c := NewMIRAClassifier(EXAMPLE, examples, 1.0) 91 | 92 | if len(c.GetActiveFeatures()) <= 0 { 93 | 
t.Errorf("len(c.GetActiveFeatures()) <= %d, want > 0", len(c.GetActiveFeatures())) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /lib/command/command.go: -------------------------------------------------------------------------------- 1 | package command 2 | 3 | import ( 4 | "github.com/syou6162/go-active-learning/lib/add" 5 | "github.com/syou6162/go-active-learning/lib/annotation" 6 | "github.com/syou6162/go-active-learning/lib/diagnosis" 7 | "github.com/syou6162/go-active-learning/lib/related_example" 8 | "github.com/syou6162/go-active-learning/lib/top_accessed_example" 9 | "github.com/urfave/cli" 10 | ) 11 | 12 | var Commands = []cli.Command{ 13 | add.CommandAdd, 14 | related_example.CommandAddRelatedExamples, 15 | annotation.CommandAnnotate, 16 | top_accessed_example.CommandAddTopAccessedExamples, 17 | diagnosis.CommandDiagnose, 18 | } 19 | -------------------------------------------------------------------------------- /lib/diagnosis/diagnosis.go: -------------------------------------------------------------------------------- 1 | package diagnosis 2 | 3 | import ( 4 | featureweight "github.com/syou6162/go-active-learning/lib/diagnosis/feature_weight" 5 | labelconflict "github.com/syou6162/go-active-learning/lib/diagnosis/label_conflict" 6 | "github.com/urfave/cli" 7 | ) 8 | 9 | var CommandDiagnose = cli.Command{ 10 | Name: "diagnose", 11 | Usage: "Diagnose training data or learned model", 12 | Description: ` 13 | Diagnose training data or learned model. This mode has two subcommand: label-conflict and feature-weight. 14 | `, 15 | 16 | Subcommands: []cli.Command{ 17 | { 18 | Name: "label-conflict", 19 | Usage: "Diagnose label conflicts in training data", 20 | Description: ` 21 | Diagnose label conflicts in training data. 'conflict' means that an annotated label is '-1/1', but a predicted label by model is '1/-1'. 
22 | `, 23 | Action: labelconflict.DoLabelConflict, 24 | Flags: []cli.Flag{ 25 | cli.BoolFlag{Name: "filter-status-code-ok", Usage: "Use only examples with status code = 200"}, 26 | }, 27 | }, 28 | { 29 | Name: "feature-weight", 30 | Usage: "List feature weight", 31 | Description: ` 32 | List feature weight. 33 | `, 34 | Action: featureweight.DoListFeatureWeight, 35 | Flags: []cli.Flag{ 36 | cli.BoolFlag{Name: "filter-status-code-ok", Usage: "Use only examples with status code = 200"}, 37 | }, 38 | }, 39 | }, 40 | } 41 | -------------------------------------------------------------------------------- /lib/diagnosis/feature_weight/feature_weight.go: -------------------------------------------------------------------------------- 1 | package featureweight 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | 7 | "github.com/syou6162/go-active-learning/lib/classifier" 8 | "github.com/syou6162/go-active-learning/lib/service" 9 | "github.com/syou6162/go-active-learning/lib/util" 10 | "github.com/syou6162/go-active-learning/lib/util/converter" 11 | "github.com/urfave/cli" 12 | ) 13 | 14 | type Feature struct { 15 | Key string 16 | Weight float64 17 | } 18 | 19 | type FeatureList []Feature 20 | 21 | func (p FeatureList) Len() int { return len(p) } 22 | func (p FeatureList) Less(i, j int) bool { return p[i].Weight < p[j].Weight } 23 | func (p FeatureList) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 24 | 25 | func DoListFeatureWeight(c *cli.Context) error { 26 | filterStatusCodeOk := c.Bool("filter-status-code-ok") 27 | 28 | app, err := service.NewDefaultApp() 29 | if err != nil { 30 | return err 31 | } 32 | defer app.Close() 33 | 34 | examples, err := app.SearchExamples() 35 | if err != nil { 36 | return err 37 | } 38 | app.Fetch(examples) 39 | for _, e := range examples { 40 | app.UpdateFeatureVector(e) 41 | } 42 | training := util.FilterLabeledExamples(examples) 43 | 44 | if filterStatusCodeOk { 45 | training = util.FilterStatusCodeOkExamples(training) 46 | } 47 | 48 | model, err := 
classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(training)) 49 | if err != nil { 50 | return err 51 | } 52 | 53 | tmp := make(FeatureList, 0) 54 | for _, k := range model.GetActiveFeatures() { 55 | tmp = append(tmp, Feature{k, model.GetWeight(k)}) 56 | } 57 | sort.Sort(sort.Reverse(tmp)) 58 | 59 | for _, p := range tmp { 60 | fmt.Println(fmt.Sprintf("%+0.2f\t%s", p.Weight, p.Key)) 61 | } 62 | 63 | return nil 64 | } 65 | -------------------------------------------------------------------------------- /lib/diagnosis/feature_weight/feature_weight_test.go: -------------------------------------------------------------------------------- 1 | package featureweight_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/syou6162/go-active-learning/lib/command" 7 | "github.com/syou6162/go-active-learning/lib/service" 8 | "github.com/syou6162/go-active-learning/lib/util/file" 9 | "github.com/urfave/cli" 10 | ) 11 | 12 | func TestDoListFeatureWeight(t *testing.T) { 13 | inputFilename := "../../../tech_input_example.txt" 14 | train, err := file.ReadExamples(inputFilename) 15 | if err != nil { 16 | t.Error(err) 17 | } 18 | 19 | a, err := service.NewDefaultApp() 20 | if err != nil { 21 | t.Error(err) 22 | } 23 | defer a.Close() 24 | 25 | if err = a.DeleteAllExamples(); err != nil { 26 | t.Error(err) 27 | } 28 | 29 | for _, example := range train { 30 | if err = a.UpdateOrCreateExample(example); err != nil { 31 | t.Error(err) 32 | } 33 | } 34 | 35 | app := cli.NewApp() 36 | app.Commands = command.Commands 37 | args := []string{ 38 | "go-active-learning", 39 | "diagnose", 40 | "feature-weight", 41 | "--filter-status-code-ok", 42 | } 43 | 44 | if err := app.Run(args); err != nil { 45 | t.Error(err) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /lib/diagnosis/label_conflict/label_conflict.go: 
-------------------------------------------------------------------------------- 1 | package labelconflict 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "sort" 7 | "strconv" 8 | 9 | "encoding/csv" 10 | 11 | "github.com/syou6162/go-active-learning/lib/classifier" 12 | "github.com/syou6162/go-active-learning/lib/model" 13 | "github.com/syou6162/go-active-learning/lib/service" 14 | "github.com/syou6162/go-active-learning/lib/util" 15 | "github.com/syou6162/go-active-learning/lib/util/converter" 16 | "github.com/urfave/cli" 17 | ) 18 | 19 | func DoLabelConflict(c *cli.Context) error { 20 | filterStatusCodeOk := c.Bool("filter-status-code-ok") 21 | 22 | app, err := service.NewDefaultApp() 23 | if err != nil { 24 | return err 25 | } 26 | defer app.Close() 27 | 28 | examples, err := app.SearchExamples() 29 | if err != nil { 30 | return err 31 | } 32 | app.Fetch(examples) 33 | for _, e := range examples { 34 | app.UpdateFeatureVector(e) 35 | } 36 | training := util.FilterLabeledExamples(examples) 37 | 38 | if filterStatusCodeOk { 39 | training = util.FilterStatusCodeOkExamples(training) 40 | } 41 | 42 | m, err := classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(training)) 43 | if err != nil { 44 | return err 45 | } 46 | 47 | wrongExamples := model.Examples{} 48 | correctExamples := model.Examples{} 49 | 50 | for _, e := range training { 51 | e.Score = m.PredictScore(e.Fv) 52 | if float64(e.Label)*e.Score < 0 { 53 | wrongExamples = append(wrongExamples, e) 54 | } else { 55 | correctExamples = append(correctExamples, e) 56 | } 57 | } 58 | 59 | sort.Sort(sort.Reverse(wrongExamples)) 60 | sort.Sort(correctExamples) 61 | printResult(*m, correctExamples, wrongExamples) 62 | 63 | return nil 64 | } 65 | 66 | func printResult(m classifier.MIRAClassifier, correctExamples model.Examples, wrongExamples model.Examples) error { 67 | fmt.Println("Index\tLabel\tScore\tURL\tTitle") 68 | result := append(wrongExamples, 
correctExamples...) 69 | 70 | w := csv.NewWriter(os.Stdout) 71 | w.Comma = '\t' 72 | 73 | for idx, e := range result { 74 | record := []string{ 75 | strconv.Itoa(idx), 76 | strconv.Itoa(int(e.Label)), 77 | fmt.Sprintf("%0.03f", m.PredictScore(e.Fv)), 78 | e.Url, 79 | e.Title, 80 | } 81 | if err := w.Write(record); err != nil { 82 | return err 83 | } 84 | } 85 | 86 | w.Flush() 87 | if err := w.Error(); err != nil { 88 | return err 89 | } 90 | 91 | return nil 92 | } 93 | -------------------------------------------------------------------------------- /lib/diagnosis/label_conflict/label_conflict_test.go: -------------------------------------------------------------------------------- 1 | package labelconflict_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/syou6162/go-active-learning/lib/command" 7 | "github.com/syou6162/go-active-learning/lib/service" 8 | "github.com/syou6162/go-active-learning/lib/util/file" 9 | "github.com/urfave/cli" 10 | ) 11 | 12 | func TestDoLabelConflict(t *testing.T) { 13 | inputFilename := "../../../tech_input_example.txt" 14 | train, err := file.ReadExamples(inputFilename) 15 | if err != nil { 16 | t.Error(err) 17 | } 18 | 19 | a, err := service.NewDefaultApp() 20 | if err != nil { 21 | t.Error(err) 22 | } 23 | defer a.Close() 24 | 25 | if err = a.DeleteAllExamples(); err != nil { 26 | t.Error(err) 27 | } 28 | 29 | for _, example := range train { 30 | if err = a.UpdateOrCreateExample(example); err != nil { 31 | t.Error(err) 32 | } 33 | } 34 | 35 | app := cli.NewApp() 36 | app.Commands = command.Commands 37 | args := []string{ 38 | "go-active-learning", 39 | "diagnose", 40 | "label-conflict", 41 | } 42 | 43 | if err := app.Run(args); err != nil { 44 | t.Error(err) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /lib/evaluation/evaluation.go: -------------------------------------------------------------------------------- 1 | package evaluation 2 | 3 | import ( 4 | 
"github.com/syou6162/go-active-learning/lib/model" 5 | ) 6 | 7 | func GetAccuracy(gold []model.LabelType, predict []model.LabelType) float64 { 8 | if len(gold) != len(predict) { 9 | return 0.0 10 | } 11 | sum := 0.0 12 | for i, v := range gold { 13 | if v == predict[i] { 14 | sum += 1.0 15 | } 16 | } 17 | return sum / float64(len(gold)) 18 | } 19 | 20 | func GetPrecision(gold []model.LabelType, predict []model.LabelType) float64 { 21 | tp := 0.0 22 | fp := 0.0 23 | for i, v := range gold { 24 | if v == model.POSITIVE && predict[i] == model.POSITIVE { 25 | tp += 1.0 26 | } 27 | if v == model.NEGATIVE && predict[i] == model.POSITIVE { 28 | fp += 1.0 29 | } 30 | } 31 | return tp / (tp + fp) 32 | } 33 | 34 | func GetRecall(gold []model.LabelType, predict []model.LabelType) float64 { 35 | tp := 0.0 36 | fn := 0.0 37 | for i, v := range gold { 38 | if v == model.POSITIVE && predict[i] == model.POSITIVE { 39 | tp += 1.0 40 | } 41 | if v == model.POSITIVE && predict[i] == model.NEGATIVE { 42 | fn += 1.0 43 | } 44 | } 45 | return tp / (tp + fn) 46 | } 47 | 48 | func GetConfusionMatrix(gold []model.LabelType, predict []model.LabelType) (int, int, int, int) { 49 | tp := 0 50 | fp := 0 51 | fn := 0 52 | tn := 0 53 | for i, v := range gold { 54 | if v == model.POSITIVE && predict[i] == model.POSITIVE { 55 | tp += 1 56 | } 57 | if v == model.NEGATIVE && predict[i] == model.POSITIVE { 58 | fp += 1 59 | } 60 | if v == model.POSITIVE && predict[i] == model.NEGATIVE { 61 | fn += 1 62 | } 63 | if v == model.NEGATIVE && predict[i] == model.NEGATIVE { 64 | tn += 1 65 | } 66 | } 67 | return tp, fp, fn, tn 68 | } 69 | -------------------------------------------------------------------------------- /lib/evaluation/evaluation_test.go: -------------------------------------------------------------------------------- 1 | package evaluation 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/syou6162/go-active-learning/lib/model" 8 | ) 9 | 10 | func TestGetAccuracy(t *testing.T) { 11 
| gold := []model.LabelType{model.POSITIVE, model.POSITIVE, model.NEGATIVE, model.NEGATIVE} 12 | predict := []model.LabelType{model.POSITIVE, model.POSITIVE, model.NEGATIVE, model.POSITIVE} 13 | accuracy := 0.75 14 | 15 | if GetAccuracy(gold, predict) != accuracy { 16 | t.Error(fmt.Printf("Accuracy should be %f", accuracy)) 17 | } 18 | } 19 | 20 | func TestGetPrecision(t *testing.T) { 21 | gold := []model.LabelType{model.POSITIVE, model.POSITIVE, model.NEGATIVE, model.NEGATIVE} 22 | predict := []model.LabelType{model.POSITIVE, model.NEGATIVE, model.NEGATIVE, model.POSITIVE} 23 | precision := 0.5 24 | 25 | if GetPrecision(gold, predict) != precision { 26 | t.Error(fmt.Printf("Precision should be %f", precision)) 27 | } 28 | } 29 | 30 | func TestGetRecall(t *testing.T) { 31 | gold := []model.LabelType{model.POSITIVE, model.POSITIVE, model.NEGATIVE, model.NEGATIVE} 32 | predict := []model.LabelType{model.POSITIVE, model.NEGATIVE, model.NEGATIVE, model.POSITIVE} 33 | recall := 0.5 34 | 35 | if GetRecall(gold, predict) != recall { 36 | t.Error(fmt.Printf("Recall should be %f", recall)) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /lib/example/example.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import ( 4 | "time" 5 | 6 | "github.com/syou6162/go-active-learning/lib/feature" 7 | example_feature "github.com/syou6162/go-active-learning/lib/feature/example" 8 | "github.com/syou6162/go-active-learning/lib/model" 9 | ) 10 | 11 | func NewExample(url string, label model.LabelType) *model.Example { 12 | IsNew := false 13 | if label == model.UNLABELED { 14 | IsNew = true 15 | } 16 | now := time.Now() 17 | return &model.Example{ 18 | Label: label, 19 | Fv: feature.FeatureVector{}, 20 | Url: url, 21 | FinalUrl: url, 22 | Title: "", 23 | Description: "", 24 | OgDescription: "", 25 | OgType: "", 26 | OgImage: "", 27 | Body: "", 28 | Score: 0.0, 29 | IsNew: IsNew, 
30 | StatusCode: 0, 31 | Favicon: "", 32 | ErrorCount: 0, 33 | CreatedAt: now, 34 | UpdatedAt: now, 35 | ReferringTweets: &model.ReferringTweets{}, 36 | HatenaBookmark: &model.HatenaBookmark{Bookmarks: make([]*model.Bookmark, 0)}, 37 | } 38 | } 39 | 40 | func GetStat(examples model.Examples) map[string]int { 41 | stat := make(map[string]int) 42 | for _, e := range examples { 43 | switch e.Label { 44 | case model.POSITIVE: 45 | stat["positive"]++ 46 | case model.NEGATIVE: 47 | stat["negative"]++ 48 | case model.UNLABELED: 49 | stat["unlabeled"]++ 50 | } 51 | } 52 | return stat 53 | } 54 | 55 | func ExtractFeatures(e model.Example) feature.FeatureVector { 56 | var fv feature.FeatureVector 57 | fv = append(fv, "BIAS") 58 | fv = append(fv, example_feature.ExtractHostFeature(e.FinalUrl)) 59 | fv = append(fv, example_feature.ExtractJpnNounFeatures(example_feature.ExtractPath(e.FinalUrl), "URL")...) 60 | fv = append(fv, example_feature.ExtractNounFeatures(e.Title, "TITLE")...) 61 | fv = append(fv, example_feature.ExtractNounFeatures(e.Description, "DESCRIPTION")...) 62 | fv = append(fv, example_feature.ExtractNounFeatures(e.Body, "BODY")...) 
// GetJapaneseTokenizer returns the package-wide kagome tokenizer.
// The tokenizer is built on the first call only (guarded by
// japaneseTokenizerOnce); later calls return the same pointer.
func GetJapaneseTokenizer() *tokenizer.Tokenizer {
	japaneseTokenizerOnce.Do(func() {
		// tokenizer.New returns a value, so keep it in a local and store its address.
		t := tokenizer.New()
		japaneseTokenizer = &t
	})

	return japaneseTokenizer
}

// GetEnglishTokenizer returns the package-wide Treebank word tokenizer,
// constructing it on the first call only (guarded by englishTokenizerOnce).
func GetEnglishTokenizer() *tokenize.TreebankWordTokenizer {
	englishTokenizerOnce.Do(func() {
		englishTokenizer = tokenize.NewTreebankWordTokenizer()
	})
	return englishTokenizer
}

// GetEnglishTagger returns the package-wide perceptron POS tagger,
// constructing it on the first call only (guarded by englishTaggerOnce).
func GetEnglishTagger() *tag.PerceptronTagger {
	englishTaggerOnce.Do(func() {
		englishTagger = tag.NewPerceptronTagger()
	})
	return englishTagger
}
| } 74 | 75 | return false 76 | } 77 | 78 | func IsExcludingWord(w string) bool { 79 | excludingWordMapOnce.Do(func() { 80 | for _, w := range excludingWordList { 81 | excludingWordMap[w] = true 82 | } 83 | }) 84 | if _, ok := excludingWordMap[w]; ok { 85 | return true 86 | } 87 | return false 88 | } 89 | 90 | func extractEngNounFeaturesWithoutPrefix(s string) feature.FeatureVector { 91 | var fv feature.FeatureVector 92 | if s == "" { 93 | return fv 94 | } 95 | 96 | words := GetEnglishTokenizer().Tokenize(s) 97 | tagger := GetEnglishTagger() 98 | for _, tok := range tagger.Tag(words) { 99 | if IsExcludingWord(tok.Text) { 100 | continue 101 | } 102 | switch tok.Tag { 103 | // https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html 104 | case "NN", "NNS", "NNP", "NNPS", "PRP", "PRP$": 105 | fv = append(fv, strings.ToLower(tok.Text)) 106 | } 107 | } 108 | 109 | return fv 110 | } 111 | 112 | func extractEngNounFeatures(s string, prefix string) feature.FeatureVector { 113 | var fv feature.FeatureVector 114 | for _, surface := range extractEngNounFeaturesWithoutPrefix(s) { 115 | fv = append(fv, prefix+":"+surface) 116 | } 117 | return fv 118 | } 119 | 120 | func ExtractJpnNounFeaturesWithoutPrefix(s string) feature.FeatureVector { 121 | var fv feature.FeatureVector 122 | if s == "" { 123 | return fv 124 | } 125 | t := GetJapaneseTokenizer() 126 | tokens := t.Tokenize(strings.ToLower(s)) 127 | for _, token := range tokens { 128 | if token.Pos() == "名詞" { 129 | surface := token.Surface 130 | if len(token.Features()) >= 2 && token.Features()[1] == "数" { 131 | surface = "NUM" 132 | } 133 | if IsExcludingWord(surface) { 134 | continue 135 | } 136 | fv = append(fv, surface) 137 | } 138 | } 139 | return fv 140 | } 141 | 142 | func ExtractJpnNounFeatures(s string, prefix string) feature.FeatureVector { 143 | var fv feature.FeatureVector 144 | for _, surface := range ExtractJpnNounFeaturesWithoutPrefix(s) { 145 | fv = append(fv, prefix+":"+surface) 146 | } 147 
// ExtractHostFeature returns "HOST:<host>" for urlString, or
// "HOST:INVALID_HOST" when the URL cannot be parsed.
func ExtractHostFeature(urlString string) string {
	const prefix = "HOST"
	parsed, err := url.Parse(urlString)
	if err == nil {
		return prefix + ":" + parsed.Host
	}
	return prefix + ":INVALID_HOST"
}

// ExtractPath returns the path component of urlString, or the empty string
// when the URL cannot be parsed.
func ExtractPath(urlString string) string {
	parsed, err := url.Parse(urlString)
	if err == nil {
		return parsed.Path
	}
	return ""
}
18 | if isJapanese(text) { 19 | t.Error(fmt.Printf("%s should be not Japanese", text)) 20 | } 21 | } 22 | 23 | func TestJapaneseNounFeatures(t *testing.T) { 24 | text := "日本語のテストです" 25 | fv := ExtractJpnNounFeaturesWithoutPrefix(text) 26 | if len(fv) != 2 { 27 | t.Error(fmt.Printf("Size of feature vector for %s should be 2, but %d", text, len(fv))) 28 | } 29 | text = "文献紹介 / Youtube" 30 | fv = ExtractJpnNounFeaturesWithoutPrefix(text) 31 | if len(fv) != 3 { 32 | t.Error(fmt.Printf("Size of feature vector for %s should be 3, but %d", text, len(fv))) 33 | } 34 | } 35 | 36 | func TestEngNounFeatures(t *testing.T) { 37 | text := "Hello World!" 38 | fv := extractEngNounFeatures(text, "") 39 | if len(fv) != 2 { 40 | t.Error(fmt.Printf("Size of feature vector for %s should be 2", text)) 41 | } 42 | } 43 | 44 | func TestExtractPath(t *testing.T) { 45 | url := "https://b.hatena.ne.jp/search/text?safe=on&q=nlp&users=50" 46 | path := "/search/text" 47 | if ExtractPath(url) != path { 48 | t.Error(fmt.Printf("path should be %s", path)) 49 | } 50 | } 51 | 52 | func TestExtractHostFeature(t *testing.T) { 53 | url := "https://b.hatena.ne.jp/search/text?safe=on&q=nlp&users=50" 54 | hostFeature := "HOST:b.hatena.ne.jp" 55 | if ExtractHostFeature(url) != hostFeature { 56 | t.Error(fmt.Printf("Host feature should be %s", hostFeature)) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /lib/feature/feature.go: -------------------------------------------------------------------------------- 1 | package feature 2 | 3 | import ( 4 | "encoding/json" 5 | ) 6 | 7 | type FeatureVector []string 8 | 9 | func (fv *FeatureVector) MarshalBinary() ([]byte, error) { 10 | json, err := json.Marshal(fv) 11 | if err != nil { 12 | return nil, err 13 | } 14 | return []byte(json), nil 15 | } 16 | 17 | func (fv *FeatureVector) UnmarshalBinary(data []byte) error { 18 | err := json.Unmarshal(data, fv) 19 | if err != nil { 20 | return err 21 | } 22 | return nil 
23 | } 24 | -------------------------------------------------------------------------------- /lib/feature/tweet/tweet.go: -------------------------------------------------------------------------------- 1 | package tweet_feature 2 | 3 | import ( 4 | "fmt" 5 | "regexp" 6 | 7 | "github.com/syou6162/go-active-learning/lib/feature" 8 | "github.com/syou6162/go-active-learning/lib/model" 9 | "gopkg.in/vmarkovtsev/go-lcss.v1" 10 | ) 11 | 12 | type ExampleAndTweet struct { 13 | example *model.Example 14 | tweet *model.Tweet 15 | lcsLen int 16 | atMarksCnt int 17 | hashTagsCnt int 18 | cleanedText string 19 | cleanedLcsLen int 20 | } 21 | 22 | func (et *ExampleAndTweet) GetLabel() model.LabelType { 23 | return et.tweet.Label 24 | } 25 | 26 | func GetExampleAndTweet(e *model.Example, t *model.Tweet) ExampleAndTweet { 27 | result := ExampleAndTweet{example: e, tweet: t} 28 | result.lcsLen = GetLCSLen(e.Title, t.FullText) 29 | 30 | atRegexp := regexp.MustCompile(`@[^ ]+`) 31 | result.atMarksCnt = len(atRegexp.FindAllStringSubmatch(t.FullText, -1)) 32 | str := atRegexp.ReplaceAllString(t.FullText, "") 33 | hashRegexp := regexp.MustCompile(`#[^ ]+`) 34 | result.hashTagsCnt = len(hashRegexp.FindAllStringSubmatch(t.FullText, -1)) 35 | result.cleanedText = hashRegexp.ReplaceAllString(str, "") 36 | result.cleanedLcsLen = GetLCSLen(e.Title, result.cleanedText) 37 | return result 38 | } 39 | 40 | func GetLCSLen(str1 string, str2 string) int { 41 | return len(string(lcss.LongestCommonSubstring([]byte(str1), []byte(str2)))) 42 | } 43 | 44 | func LCSLenFeature(et ExampleAndTweet) string { 45 | prefix := "LCSLenFeature" 46 | len := et.lcsLen 47 | switch { 48 | case len == 0: 49 | return fmt.Sprintf("%s:0", prefix) 50 | case len < 5: 51 | return fmt.Sprintf("%s:5", prefix) 52 | case len < 10: 53 | return fmt.Sprintf("%s:10", prefix) 54 | case len < 25: 55 | return fmt.Sprintf("%s:25", prefix) 56 | case len < 50: 57 | return fmt.Sprintf("%s:50", prefix) 58 | case len < 100: 59 | return 
fmt.Sprintf("%s:100", prefix) 60 | default: 61 | return fmt.Sprintf("%s:INF", prefix) 62 | } 63 | } 64 | 65 | func CleanedLCSLenFeature(et ExampleAndTweet) string { 66 | prefix := "CleanedLCSLenFeature" 67 | len := et.cleanedLcsLen 68 | switch { 69 | case len == 0: 70 | return fmt.Sprintf("%s:0", prefix) 71 | case len < 5: 72 | return fmt.Sprintf("%s:5", prefix) 73 | case len < 10: 74 | return fmt.Sprintf("%s:10", prefix) 75 | case len < 25: 76 | return fmt.Sprintf("%s:25", prefix) 77 | case len < 50: 78 | return fmt.Sprintf("%s:50", prefix) 79 | case len < 100: 80 | return fmt.Sprintf("%s:100", prefix) 81 | default: 82 | return fmt.Sprintf("%s:INF", prefix) 83 | } 84 | } 85 | 86 | func LCSRatioFeature(et ExampleAndTweet) string { 87 | prefix := "LCSRatioFeature" 88 | ratio := float64(et.lcsLen) / float64(len(et.tweet.FullText)) 89 | switch { 90 | case ratio == 0.0: 91 | return fmt.Sprintf("%s:0.0", prefix) 92 | case ratio < 0.1: 93 | return fmt.Sprintf("%s:0.1", prefix) 94 | case ratio < 0.25: 95 | return fmt.Sprintf("%s:0.25", prefix) 96 | case ratio < 0.5: 97 | return fmt.Sprintf("%s:0.5", prefix) 98 | case ratio < 0.75: 99 | return fmt.Sprintf("%s:0.75", prefix) 100 | case ratio < 0.9: 101 | return fmt.Sprintf("%s:0.0", prefix) 102 | default: 103 | return fmt.Sprintf("%s:1.0", prefix) 104 | } 105 | } 106 | 107 | func CleanedLCSRatioFeature(et ExampleAndTweet) string { 108 | prefix := "CleanedLCSRatioFeature" 109 | ratio := float64(et.cleanedLcsLen) / float64(len(et.tweet.FullText)) 110 | switch { 111 | case ratio == 0.0: 112 | return fmt.Sprintf("%s:0.0", prefix) 113 | case ratio < 0.1: 114 | return fmt.Sprintf("%s:0.1", prefix) 115 | case ratio < 0.25: 116 | return fmt.Sprintf("%s:0.25", prefix) 117 | case ratio < 0.5: 118 | return fmt.Sprintf("%s:0.5", prefix) 119 | case ratio < 0.75: 120 | return fmt.Sprintf("%s:0.75", prefix) 121 | case ratio < 0.9: 122 | return fmt.Sprintf("%s:0.0", prefix) 123 | default: 124 | return fmt.Sprintf("%s:1.0", prefix) 125 | } 
126 | } 127 | 128 | func FavoriteCountFeature(et ExampleAndTweet) string { 129 | prefix := "FavoriteCountFeature" 130 | cnt := et.tweet.FavoriteCount 131 | switch { 132 | case cnt == 0: 133 | return fmt.Sprintf("%s:0", prefix) 134 | case cnt == 1: 135 | return fmt.Sprintf("%s:1", prefix) 136 | case cnt <= 3: 137 | return fmt.Sprintf("%s:3", prefix) 138 | case cnt <= 5: 139 | return fmt.Sprintf("%s:5", prefix) 140 | case cnt <= 10: 141 | return fmt.Sprintf("%s:10", prefix) 142 | case cnt <= 25: 143 | return fmt.Sprintf("%s:25", prefix) 144 | case cnt <= 50: 145 | return fmt.Sprintf("%s:50", prefix) 146 | case cnt <= 100: 147 | return fmt.Sprintf("%s:100", prefix) 148 | default: 149 | return fmt.Sprintf("%s:INF", prefix) 150 | } 151 | } 152 | 153 | func RetweetCountFeature(et ExampleAndTweet) string { 154 | prefix := "RetweetCountFeature" 155 | cnt := et.tweet.RetweetCount 156 | switch { 157 | case cnt == 0: 158 | return fmt.Sprintf("%s:0", prefix) 159 | case cnt == 1: 160 | return fmt.Sprintf("%s:1", prefix) 161 | case cnt <= 3: 162 | return fmt.Sprintf("%s:3", prefix) 163 | case cnt <= 5: 164 | return fmt.Sprintf("%s:5", prefix) 165 | case cnt <= 10: 166 | return fmt.Sprintf("%s:10", prefix) 167 | case cnt <= 25: 168 | return fmt.Sprintf("%s:25", prefix) 169 | case cnt <= 50: 170 | return fmt.Sprintf("%s:50", prefix) 171 | case cnt <= 100: 172 | return fmt.Sprintf("%s:100", prefix) 173 | default: 174 | return fmt.Sprintf("%s:INF", prefix) 175 | } 176 | } 177 | 178 | func AtMarksCountFeature(et ExampleAndTweet) string { 179 | prefix := "AtMarksCountFeature" 180 | cnt := et.atMarksCnt 181 | switch { 182 | case cnt == 0: 183 | return fmt.Sprintf("%s:0", prefix) 184 | case cnt == 1: 185 | return fmt.Sprintf("%s:1", prefix) 186 | case cnt <= 3: 187 | return fmt.Sprintf("%s:3", prefix) 188 | case cnt <= 5: 189 | return fmt.Sprintf("%s:5", prefix) 190 | case cnt <= 10: 191 | return fmt.Sprintf("%s:10", prefix) 192 | default: 193 | return fmt.Sprintf("%s:INF", prefix) 194 
| } 195 | } 196 | 197 | func HashTagsCountFeature(et ExampleAndTweet) string { 198 | prefix := "HashTagsCountFeature" 199 | cnt := et.atMarksCnt 200 | switch { 201 | case cnt == 0: 202 | return fmt.Sprintf("%s:0", prefix) 203 | case cnt == 1: 204 | return fmt.Sprintf("%s:1", prefix) 205 | case cnt <= 3: 206 | return fmt.Sprintf("%s:3", prefix) 207 | case cnt <= 5: 208 | return fmt.Sprintf("%s:5", prefix) 209 | case cnt <= 10: 210 | return fmt.Sprintf("%s:10", prefix) 211 | default: 212 | return fmt.Sprintf("%s:INF", prefix) 213 | } 214 | } 215 | 216 | func TextLengthFeature(et ExampleAndTweet) string { 217 | prefix := "TextLengthFeature" 218 | cnt := len(et.tweet.FullText) 219 | switch { 220 | case cnt == 0: 221 | return fmt.Sprintf("%s:0", prefix) 222 | case cnt == 1: 223 | return fmt.Sprintf("%s:1", prefix) 224 | case cnt == 3: 225 | return fmt.Sprintf("%s:3", prefix) 226 | case cnt < 5: 227 | return fmt.Sprintf("%s:5", prefix) 228 | case cnt < 10: 229 | return fmt.Sprintf("%s:10", prefix) 230 | case cnt < 25: 231 | return fmt.Sprintf("%s:25", prefix) 232 | case cnt < 50: 233 | return fmt.Sprintf("%s:50", prefix) 234 | case cnt < 100: 235 | return fmt.Sprintf("%s:100", prefix) 236 | default: 237 | return fmt.Sprintf("%s:INF", prefix) 238 | } 239 | } 240 | 241 | func CleanedTextLengthFeature(et ExampleAndTweet) string { 242 | prefix := "CleanedTextLengthFeature" 243 | cnt := len(et.cleanedText) 244 | switch { 245 | case cnt == 0: 246 | return fmt.Sprintf("%s:0", prefix) 247 | case cnt == 1: 248 | return fmt.Sprintf("%s:1", prefix) 249 | case cnt == 3: 250 | return fmt.Sprintf("%s:3", prefix) 251 | case cnt < 5: 252 | return fmt.Sprintf("%s:5", prefix) 253 | case cnt < 10: 254 | return fmt.Sprintf("%s:10", prefix) 255 | case cnt < 25: 256 | return fmt.Sprintf("%s:25", prefix) 257 | case cnt < 50: 258 | return fmt.Sprintf("%s:50", prefix) 259 | case cnt < 100: 260 | return fmt.Sprintf("%s:100", prefix) 261 | default: 262 | return fmt.Sprintf("%s:INF", prefix) 263 | 
} 264 | } 265 | 266 | func ScreenNameFeature(et ExampleAndTweet) string { 267 | prefix := "ScreenNameFeature" 268 | return fmt.Sprintf("%s:%s", prefix, et.tweet.ScreenName) 269 | } 270 | 271 | func (et *ExampleAndTweet) GetFeatureVector() feature.FeatureVector { 272 | var fv feature.FeatureVector 273 | 274 | fv = append(fv, "BIAS") 275 | fv = append(fv, LCSLenFeature(*et)) 276 | fv = append(fv, CleanedLCSLenFeature(*et)) 277 | fv = append(fv, LCSRatioFeature(*et)) 278 | fv = append(fv, CleanedLCSRatioFeature(*et)) 279 | fv = append(fv, TextLengthFeature(*et)) 280 | fv = append(fv, CleanedTextLengthFeature(*et)) 281 | 282 | fv = append(fv, ScreenNameFeature(*et)) 283 | fv = append(fv, FavoriteCountFeature(*et)) 284 | fv = append(fv, RetweetCountFeature(*et)) 285 | fv = append(fv, AtMarksCountFeature(*et)) 286 | fv = append(fv, HashTagsCountFeature(*et)) 287 | return fv 288 | } 289 | -------------------------------------------------------------------------------- /lib/feature/tweet/tweet_test.go: -------------------------------------------------------------------------------- 1 | package tweet_feature 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | 7 | "github.com/syou6162/go-active-learning/lib/feature" 8 | "github.com/syou6162/go-active-learning/lib/model" 9 | ) 10 | 11 | func TestExtractHostFeature(t *testing.T) { 12 | e := model.Example{} 13 | e.Title = "Hello world" 14 | tweet := model.Tweet{} 15 | tweet.ScreenName = "syou6162" 16 | tweet.FullText = "Hello world @syou6162 @syou6163 #hashtag1 #hashtag2" 17 | tweet.FavoriteCount = 7 18 | tweet.RetweetCount = 7 19 | 20 | et := GetExampleAndTweet(&e, &tweet) 21 | fv := et.GetFeatureVector() 22 | expect := feature.FeatureVector{ 23 | "BIAS", 24 | "LCSLenFeature:25", 25 | "CleanedLCSLenFeature:25", 26 | "LCSRatioFeature:0.25", 27 | "CleanedLCSRatioFeature:0.25", 28 | "TextLengthFeature:100", 29 | "CleanedTextLengthFeature:25", 30 | "ScreenNameFeature:syou6162", 31 | "FavoriteCountFeature:10", 32 | 
"RetweetCountFeature:10", 33 | "AtMarksCountFeature:3", 34 | "HashTagsCountFeature:3", 35 | } 36 | if !reflect.DeepEqual(expect, fv) { 37 | t.Error("feature must be wrong") 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /lib/fetcher/fetcher.go: -------------------------------------------------------------------------------- 1 | package fetcher 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "io/ioutil" 7 | "net/http" 8 | "regexp" 9 | "strings" 10 | "time" 11 | 12 | "net/url" 13 | "unicode/utf8" 14 | 15 | "github.com/PuerkitoBio/goquery" 16 | goose "github.com/syou6162/GoOse" 17 | ) 18 | 19 | type Article struct { 20 | Url string 21 | Title string 22 | Description string 23 | OgDescription string 24 | OgType string 25 | OgImage string 26 | Body string 27 | StatusCode int 28 | Favicon string 29 | PublishDate *time.Time 30 | } 31 | 32 | var articleFetcher = http.Client{ 33 | Transport: &http.Transport{ 34 | MaxIdleConns: 0, 35 | MaxIdleConnsPerHost: 100, 36 | }, 37 | Timeout: time.Duration(5 * time.Second), 38 | } 39 | 40 | func updateTitleIfArxiv(article *goose.Article, origUrl string, finalUrl string, html []byte) error { 41 | arxivUrl := "https://arxiv.org/abs/" 42 | if strings.Contains(origUrl, arxivUrl) || strings.Contains(finalUrl, arxivUrl) { 43 | // arxivのhtml内にはtitleタグが複数存在するので、丁寧にタイトルを取得する... 
44 | re := regexp.MustCompile(`(.*?)`) 45 | m := re.FindSubmatch(html) 46 | if len(m) >= 2 { 47 | article.Title = string(m[1]) 48 | } 49 | } 50 | return nil 51 | } 52 | 53 | func updateMetaDescriptionIfArxiv(article *goose.Article, origUrl string, finalUrl string, html []byte) error { 54 | arxivUrl := "https://arxiv.org/abs/" 55 | if strings.Contains(origUrl, arxivUrl) || strings.Contains(finalUrl, arxivUrl) { 56 | // article.Docでもいけそうだが、gooseが中で書き換えていてダメ。Documentを作りなおす 57 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(html))) 58 | if err != nil { 59 | return err 60 | } 61 | article.MetaDescription = doc.Find(".abstract").Text() 62 | } 63 | return nil 64 | } 65 | 66 | func removeUtmParams(origUrl string) (string, error) { 67 | u, err := url.Parse(origUrl) 68 | if err != nil { 69 | return origUrl, err 70 | } 71 | 72 | q, err := url.ParseQuery(u.RawQuery) 73 | if err != nil { 74 | return origUrl, err 75 | } 76 | 77 | q.Del("utm_source") 78 | q.Del("utm_medium") 79 | q.Del("utm_campaign") 80 | q.Del("utm_term") 81 | q.Del("utm_content") 82 | 83 | q.Del("gi") 84 | 85 | u.RawQuery = q.Encode() 86 | 87 | return u.String(), nil 88 | } 89 | 90 | func GetArticle(origUrl string) (*Article, error) { 91 | g := goose.New() 92 | resp, err := articleFetcher.Get(origUrl) 93 | if err != nil { 94 | return nil, err 95 | } 96 | if resp.StatusCode == http.StatusFound || 97 | resp.StatusCode == http.StatusUnauthorized || 98 | resp.StatusCode == http.StatusForbidden || 99 | resp.StatusCode == http.StatusNotFound || 100 | resp.StatusCode == http.StatusGone || 101 | resp.StatusCode == http.StatusBadGateway || 102 | resp.StatusCode == http.StatusServiceUnavailable { 103 | return nil, errors.New(fmt.Sprintf("%s: Cannot fetch %s", resp.Status, origUrl)) 104 | } 105 | defer resp.Body.Close() 106 | 107 | html, err := ioutil.ReadAll(resp.Body) 108 | if err != nil { 109 | return nil, err 110 | } 111 | 112 | if !utf8.Valid(html) { 113 | return nil, 
errors.New(fmt.Sprintf("Invalid utf8 document: %s", origUrl)) 114 | } 115 | 116 | article, err := g.ExtractFromRawHTML(resp.Request.URL.String(), string(html)) 117 | if err != nil { 118 | return nil, err 119 | } 120 | 121 | finalUrl := article.CanonicalLink 122 | if finalUrl == "" { 123 | finalUrl = resp.Request.URL.String() 124 | } 125 | 126 | finalUrl, err = removeUtmParams(finalUrl) 127 | if err != nil { 128 | return nil, err 129 | } 130 | 131 | updateTitleIfArxiv(article, origUrl, finalUrl, html) 132 | updateMetaDescriptionIfArxiv(article, origUrl, finalUrl, html) 133 | 134 | favicon := "" 135 | if u, err := url.Parse(article.MetaFavicon); err == nil { 136 | if u.IsAbs() { 137 | favicon = article.MetaFavicon 138 | } 139 | } 140 | 141 | return &Article{ 142 | Url: finalUrl, 143 | Title: article.Title, 144 | Description: article.MetaDescription, 145 | OgDescription: article.MetaOgDescription, 146 | OgType: article.MetaOgType, 147 | OgImage: article.MetaOgImage, 148 | Body: article.CleanedText, 149 | StatusCode: resp.StatusCode, 150 | Favicon: favicon, 151 | PublishDate: article.PublishDate, 152 | }, nil 153 | } 154 | -------------------------------------------------------------------------------- /lib/fetcher/fetcher_test.go: -------------------------------------------------------------------------------- 1 | package fetcher 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestGetArticle(t *testing.T) { 9 | a, err := GetArticle("https://www.yasuhisay.info/entry/20090516/1242480413") 10 | if err != nil { 11 | t.Error(err.Error()) 12 | } 13 | 14 | if a.Title == "" { 15 | t.Error("Title must not be empty") 16 | } 17 | if a.Description == "" { 18 | t.Error("Description must not be empty") 19 | } 20 | if a.OgType != "article" { 21 | t.Error("OgType must be article") 22 | } 23 | if a.StatusCode != 200 { 24 | t.Error("StatusCode must be 200") 25 | } 26 | } 27 | 28 | func TestGetArticleARXIV(t *testing.T) { 29 | a, err := 
GetArticle("https://arxiv.org/abs/2012.07805") 30 | if err != nil { 31 | t.Error(err.Error()) 32 | } 33 | 34 | if a.Title != "[2012.07805] Extracting Training Data from Large Language Models" { 35 | t.Error("Title must not be empty") 36 | } 37 | if a.Description == "" { 38 | t.Error("Description must not be empty") 39 | } 40 | if a.StatusCode != 200 { 41 | t.Error("StatusCode must be 200") 42 | } 43 | } 44 | 45 | func TestGetArticleNotFound(t *testing.T) { 46 | _, err := GetArticle("https://www.yasuhisay.info/entry/NOT_FOUND") 47 | if err == nil { 48 | t.Error("Error should occur") 49 | } 50 | } 51 | 52 | func TestGetArticleWithInvalidEncoding(t *testing.T) { 53 | url := "http://www.atmarkit.co.jp/ait/articles/1702/20/news021.html" 54 | _, err := GetArticle(url) 55 | if err == nil { 56 | t.Error(fmt.Sprintf("Error must occur for this url: %s", url)) 57 | } 58 | } 59 | 60 | func TestRemoveUtmParams(t *testing.T) { 61 | before := "https://techplay.jp/event/698349?utm_source=event_698349" 62 | after, err := removeUtmParams(before) 63 | if err != nil { 64 | t.Error(fmt.Sprintf("Error must occur for this url: %s", before)) 65 | } 66 | expected := "https://techplay.jp/event/698349" 67 | if expected != after { 68 | t.Errorf("url should be %s, but %s", expected, after) 69 | } 70 | a, err := GetArticle(before) 71 | if expected != a.Url { 72 | t.Errorf("url should be %s, but %s", expected, a.Url) 73 | } 74 | } 75 | 76 | func TestFavicon(t *testing.T) { 77 | url := "https://www.yasuhisay.info/entry/2020/11/22/190000" 78 | a, err := GetArticle(url) 79 | if err != nil { 80 | t.Error(fmt.Sprintf("Error must not occur for this url: %s", url)) 81 | } 82 | expectedFaviconPath := "https://www.yasuhisay.info/icon/favicon" 83 | if expectedFaviconPath != a.Favicon { 84 | t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath) 85 | } 86 | 87 | url = "https://www.lifehacker.jp/2018/11/amazon-impact-absorption-case.html" 88 | a, err = GetArticle(url) 89 | if err != nil { 90 | 
t.Error(fmt.Sprintf("Error must not occur for this url: %s", url)) 91 | } 92 | expectedFaviconPath = "https://www.lifehacker.jp/assets/common/img/favicon.ico" 93 | if expectedFaviconPath != a.Favicon { 94 | t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath) 95 | } 96 | 97 | url = "https://peterroelants.github.io/" 98 | a, err = GetArticle(url) 99 | if err != nil { 100 | t.Error(fmt.Sprintf("Error must not occur for this url: %s", url)) 101 | } 102 | expectedFaviconPath = "https://peterroelants.github.io/images/favicon/apple-icon-57x57.png" 103 | if expectedFaviconPath != a.Favicon { 104 | t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath) 105 | } 106 | 107 | url = "https://www.getrevue.co/profile/icoxfog417/issues/weekly-machine-learning-79-121292" 108 | a, err = GetArticle(url) 109 | if err != nil { 110 | t.Error(fmt.Sprintf("Error must not occur for this url: %s", url)) 111 | } 112 | expectedFaviconPath = "https://d3jbm9h03wxzi9.cloudfront.net/assets/favicon-84fc7f228d52c2410eb7aa839e279caeaa491588c7c75229ed33e1c7f69fe75d.ico" 113 | if expectedFaviconPath != a.Favicon { 114 | t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath) 115 | } 116 | 117 | url = "https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html" 118 | a, err = GetArticle(url) 119 | if err != nil { 120 | t.Error(fmt.Sprintf("Error must not occur for this url: %s", url)) 121 | } 122 | expectedFaviconPath = "https://ai.googleblog.com/favicon.ico" 123 | if expectedFaviconPath != a.Favicon { 124 | t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath) 125 | } 126 | } 127 | 128 | func TestGetPublishDate(t *testing.T) { 129 | a, err := GetArticle("https://www.yasuhisay.info/entry/2019/11/18/153000") 130 | if err != nil { 131 | t.Error("Error should not occur") 132 | } 133 | if a.PublishDate == nil { 134 | t.Error("PublishDate must not be nil") 135 | } 136 | } 137 | 
-------------------------------------------------------------------------------- /lib/hatena_bookmark/hatena_bookmark.go: -------------------------------------------------------------------------------- 1 | package hatena_bookmark 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io/ioutil" 7 | "net/http" 8 | 9 | "github.com/syou6162/go-active-learning/lib/model" 10 | ) 11 | 12 | func GetHatenaBookmark(url string) (*model.HatenaBookmark, error) { 13 | // ref: http://developer.hatena.ne.jp/ja/documents/bookmark/apis/getinfo 14 | res, err := http.Get(fmt.Sprintf("https://b.hatena.ne.jp/entry/jsonlite/?url=%s", url)) 15 | if err != nil { 16 | return nil, err 17 | } 18 | if res.StatusCode != http.StatusOK { 19 | return nil, fmt.Errorf("error: %d", res.StatusCode) 20 | } 21 | 22 | defer res.Body.Close() 23 | body, error := ioutil.ReadAll(res.Body) 24 | if error != nil { 25 | return nil, err 26 | } 27 | 28 | bookmarks := model.HatenaBookmark{} 29 | err = json.Unmarshal(body, &bookmarks) 30 | if error != nil { 31 | return nil, err 32 | } 33 | return &bookmarks, nil 34 | } 35 | -------------------------------------------------------------------------------- /lib/hatena_bookmark/hatena_bookmark_test.go: -------------------------------------------------------------------------------- 1 | package hatena_bookmark 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestGetHatenaBookmark(t *testing.T) { 8 | bookmarks, err := GetHatenaBookmark("https://www.yasuhisay.info") 9 | if err != nil { 10 | t.Error(err.Error()) 11 | } 12 | 13 | if bookmarks.Title == "" { 14 | t.Error("Title must not be empty") 15 | } 16 | if bookmarks.Count == 0 { 17 | t.Error("Count must not be 0") 18 | } 19 | if len(bookmarks.Bookmarks) == 0 { 20 | t.Error("Count must not be 0") 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /lib/model/error.go: -------------------------------------------------------------------------------- 1 | package model 2 | 3 | 
// notFoundError is the error returned when a looked-up entity does not exist.
// Its string value is the name of the kind of entity that was missing.
type notFoundError string

// Error implements the error interface. The message is the entity kind
// followed by " not found".
func (err notFoundError) Error() string {
	return string(err) + " not found"
}

// NotFoundError returns an error stating that an entity of kind typ was not
// found. Errors created with the same typ compare equal with ==.
func NotFoundError(typ string) error {
	return notFoundError(typ)
}

// IsNotFound reports whether err, or any error reachable from it through
// Unwrap() methods, was created by NotFoundError. A nil err yields false.
func IsNotFound(err error) bool {
	for err != nil {
		if _, ok := err.(notFoundError); ok {
			return true
		}
		// Follow the wrapping chain by hand so that no extra import is needed.
		wrapper, ok := err.(interface{ Unwrap() error })
		if !ok {
			return false
		}
		err = wrapper.Unwrap()
	}
	return false
}
// hatenaBookmarkTimeLayout is the timestamp layout used in the JSON handled
// by HatenaBookmarkTime.
const hatenaBookmarkTimeLayout = "2006/01/02 15:04"

// scanTypeError is returned by the Scan methods below when the database
// driver hands over a value of a Go type they cannot convert. Its string
// value is the name of the destination type.
type scanTypeError string

// Error implements the error interface.
func (e scanTypeError) Error() string {
	return "unsupported Scan source for " + string(e)
}

// Tags is a list of tags. In the database it is stored as a single
// tab-separated string.
type Tags []string

// HatenaBookmarkTime wraps a *time.Time so that it can be decoded from and
// encoded to the "2006/01/02 15:04" JSON format and stored in the database.
// A nil Time represents "no value" (JSON null / SQL NULL).
type HatenaBookmarkTime struct {
	*time.Time
}

// UnmarshalJSON parses a quoted "2006/01/02 15:04" timestamp. JSON null is a
// no-op, following the encoding/json convention. On a parse error the
// receiver is left unchanged.
//
// ref: https://dev.classmethod.jp/go/struct-json/
func (hbt *HatenaBookmarkTime) UnmarshalJSON(data []byte) error {
	if string(data) == "null" {
		return nil
	}
	t, err := time.Parse(`"`+hatenaBookmarkTimeLayout+`"`, string(data))
	if err != nil {
		return err
	}
	*hbt = HatenaBookmarkTime{&t}
	return nil
}

// MarshalJSON encodes the timestamp as a "2006/01/02 15:04" JSON string, or
// as null when no time is set.
func (hbt HatenaBookmarkTime) MarshalJSON() ([]byte, error) {
	if hbt.Time == nil {
		return []byte("null"), nil
	}
	return json.Marshal(hbt.Format(hatenaBookmarkTimeLayout))
}

// Scan implements sql.Scanner. It accepts a time.Time or nil (which clears
// the value); any other source type is reported as an error instead of
// panicking.
//
// ref: https://qiita.com/roothybrid7/items/52623bedb45ff0c26a8a
func (hbt *HatenaBookmarkTime) Scan(value interface{}) error {
	switch v := value.(type) {
	case nil:
		hbt.Time = nil
	case time.Time:
		hbt.Time = &v
	default:
		return scanTypeError("HatenaBookmarkTime")
	}
	return nil
}

// Value implements driver.Valuer. An unset time is written as NULL.
func (hbt HatenaBookmarkTime) Value() (driver.Value, error) {
	if hbt.Time == nil {
		return nil, nil
	}
	return *hbt.Time, nil
}

// Scan implements sql.Scanner. It replaces the receiver with the tags found
// in a tab-separated string (or []byte). An empty string or nil yields an
// empty, non-nil Tags; any other source type is reported as an error.
func (tags *Tags) Scan(value interface{}) error {
	var s string
	switch v := value.(type) {
	case nil:
		// treated like an empty string
	case string:
		s = v
	case []byte:
		s = string(v)
	default:
		return scanTypeError("Tags")
	}
	if s == "" {
		*tags = Tags{}
		return nil
	}
	// Assign rather than append so that a reused destination does not
	// accumulate tags from a previous row.
	*tags = Tags(strings.Split(s, "\t"))
	return nil
}

// Value implements driver.Valuer by joining the tags with tab characters.
func (tags Tags) Value() (driver.Value, error) {
	return strings.Join(tags, "\t"), nil
}
// RecommendationListType identifies one of the recommendation lists.
type RecommendationListType int

// The known recommendation lists, numbered 0 through 6.
const (
	GENERAL RecommendationListType = iota
	ARTICLE
	GITHUB
	SLIDE
	ARXIV
	VIDEO
	EVENT
)

// recommendationListTypes maps each list name accepted by
// GetRecommendationListType to its RecommendationListType.
var recommendationListTypes = map[string]RecommendationListType{
	"general": GENERAL,
	"article": ARTICLE,
	"github":  GITHUB,
	"slide":   SLIDE,
	"arxiv":   ARXIV,
	"video":   VIDEO,
	"event":   EVENT,
}

// GetRecommendationListType converts a list name into its
// RecommendationListType. Unknown names yield -1 and an error.
func GetRecommendationListType(listname string) (RecommendationListType, error) {
	if listType, found := recommendationListTypes[listname]; found {
		return listType, nil
	}
	return -1, fmt.Errorf("no such RecommendationListType for '%s'", listname)
}

// Recommendation is a recommendation list together with the ids of the
// examples it contains.
type Recommendation struct {
	RecommendationListType RecommendationListType
	ExampleIds             []int
}
// parseLine splits a "<exampleId>\t<relatedExampleId>" line into its two
// integer ids.
//
// It returns an error when the line does not consist of exactly two
// tab-separated fields, or when either field is not a valid integer (such
// fields were previously turned into id 0 without any error).
func parseLine(line string) (int, int, error) {
	tokens := strings.Split(line, "\t")
	if len(tokens) != 2 {
		return 0, 0, fmt.Errorf("Invalid line: %s", line)
	}
	exampleId, err := strconv.Atoi(tokens[0])
	if err != nil {
		return 0, 0, fmt.Errorf("Invalid line: %s (%v)", line, err)
	}
	relatedExampleId, err := strconv.Atoi(tokens[1])
	if err != nil {
		return 0, 0, fmt.Errorf("Invalid line: %s (%v)", line, err)
	}
	return exampleId, relatedExampleId, nil
}
append(exampleId2RelatedExampleIds[exampleId], relatedExampleId) 43 | } else { 44 | exampleId2RelatedExampleIds[exampleId] = []int{relatedExampleId} 45 | } 46 | } 47 | if err := scanner.Err(); err != nil { 48 | return nil, err 49 | } 50 | result := make([]*model.RelatedExamples, 0) 51 | for exampleId, relatedExampleIds := range exampleId2RelatedExampleIds { 52 | result = append(result, &model.RelatedExamples{ExampleId: exampleId, RelatedExampleIds: relatedExampleIds}) 53 | } 54 | return result, nil 55 | } 56 | 57 | func doAddRelatedExamples(c *cli.Context) error { 58 | inputFilename := c.String("input-filename") 59 | 60 | if inputFilename == "" { 61 | _ = cli.ShowCommandHelp(c, "add-related-examples") 62 | return cli.NewExitError("`input-filename` is a required field.", 1) 63 | } 64 | 65 | app, err := service.NewDefaultApp() 66 | if err != nil { 67 | return err 68 | } 69 | defer app.Close() 70 | 71 | relatedExamplesList, err := readRelatedExamples(inputFilename) 72 | if err != nil { 73 | return err 74 | } 75 | for _, relatedExamples := range relatedExamplesList { 76 | for _, related := range relatedExamples.RelatedExampleIds { 77 | fmt.Print(relatedExamples.ExampleId) 78 | fmt.Print("\t") 79 | fmt.Println(related) 80 | } 81 | err := app.UpdateRelatedExamples(*relatedExamples) 82 | if err != nil { 83 | return err 84 | } 85 | } 86 | return nil 87 | } 88 | 89 | var CommandAddRelatedExamples = cli.Command{ 90 | Name: "add-related-examples", 91 | Usage: "add related examples", 92 | Description: ` 93 | Add related examples. 
94 | `, 95 | Action: doAddRelatedExamples, 96 | Flags: []cli.Flag{ 97 | cli.StringFlag{Name: "input-filename"}, 98 | }, 99 | } 100 | -------------------------------------------------------------------------------- /lib/repository/example.go: -------------------------------------------------------------------------------- 1 | package repository 2 | 3 | import ( 4 | "bufio" 5 | "database/sql" 6 | "fmt" 7 | "io" 8 | "time" 9 | 10 | "github.com/lib/pq" 11 | "github.com/syou6162/go-active-learning/lib/feature" 12 | "github.com/syou6162/go-active-learning/lib/model" 13 | "github.com/syou6162/go-active-learning/lib/util/file" 14 | ) 15 | 16 | var exampleNotFoundError = model.NotFoundError("example") 17 | 18 | // データが存在しなければ追加 19 | // データが存在する場合は、以下の場合にのみ更新する 20 | // - ラベルが正例か負例に変更された 21 | // - クロール対象のサイトが一時的に200以外のステータスで前回データが取得できなかった 22 | func (r *repository) UpdateOrCreateExample(e *model.Example) error { 23 | now := time.Now() 24 | e.UpdatedAt = now 25 | _, err := r.db.NamedExec(` 26 | INSERT INTO example 27 | ( url, final_url, title, description, og_description, og_type, og_image, body, score, is_new, status_code, favicon, label, created_at, updated_at) 28 | VALUES 29 | (:url, :final_url, :title, :description, :og_description, :og_type, :og_image, :body, :score, :is_new, :status_code, :favicon, :label, :created_at, :updated_at) 30 | ON CONFLICT (url) 31 | DO UPDATE SET 32 | url = :url, final_url = :final_url, title = :title, 33 | description = :description, og_description = :og_description, og_type = :og_type, og_image = :og_image, 34 | body = :body, score = :score, is_new = :is_new, status_code = :status_code, favicon = :favicon, 35 | label = :label, created_at = :created_at, updated_at = :updated_at 36 | WHERE 37 | ((EXCLUDED.label != 0) AND (example.label != EXCLUDED.label)) OR 38 | ((example.status_code != 200) AND (EXCLUDED.status_code = 200)) 39 | ;`, e) 40 | if err != nil { 41 | return err 42 | } 43 | tmp, err := r.FindExampleByUlr(e.Url) 44 | if err != nil { 
45 | return err 46 | } 47 | e.Id = tmp.Id 48 | return nil 49 | } 50 | 51 | func (r *repository) UpdateScore(e *model.Example) error { 52 | if _, err := r.FindExampleByUlr(e.Url); err != nil { 53 | return err 54 | } 55 | if _, err := r.db.Exec(`UPDATE example SET score = $1, updated_at = $2 WHERE url = $3;`, e.Score, time.Now(), e.Url); err != nil { 56 | return err 57 | } 58 | return nil 59 | } 60 | 61 | func (r *repository) IncErrorCount(e *model.Example) error { 62 | errorCount, err := r.GetErrorCount(e) 63 | if err != nil { 64 | return err 65 | } 66 | if _, err := r.db.Exec(`UPDATE example SET error_count = $1, updated_at = $2 WHERE url = $3;`, errorCount+1, time.Now(), e.Url); err != nil { 67 | return err 68 | } 69 | return nil 70 | } 71 | 72 | func (r *repository) GetErrorCount(e *model.Example) (int, error) { 73 | example, err := r.FindExampleByUlr(e.Url) 74 | if err != nil { 75 | if err == exampleNotFoundError { 76 | return 0, nil 77 | } 78 | return 0, err 79 | } 80 | return example.ErrorCount, nil 81 | } 82 | 83 | func (r *repository) UpdateFeatureVector(e *model.Example) error { 84 | tmp, err := r.FindExampleByUlr(e.Url) 85 | if err != nil { 86 | return err 87 | } 88 | id := tmp.Id 89 | if _, err = r.db.Exec(`DELETE FROM feature WHERE example_id = $1;`, id); err != nil { 90 | return err 91 | } 92 | _, err = r.db.Exec(`INSERT INTO feature (example_id, feature) VALUES ($1, unnest(cast($2 AS TEXT[])));`, id, pq.Array(e.Fv)) 93 | return err 94 | } 95 | 96 | func (r *repository) InsertExampleFromScanner(scanner *bufio.Scanner) (*model.Example, error) { 97 | line := scanner.Text() 98 | e, err := file.ParseLine(line) 99 | if err != nil { 100 | return nil, err 101 | } 102 | err = r.UpdateOrCreateExample(e) 103 | if err != nil { 104 | return nil, err 105 | } 106 | return e, nil 107 | } 108 | 109 | func (r *repository) InsertExamplesFromReader(reader io.Reader) error { 110 | scanner := bufio.NewScanner(reader) 111 | 112 | for scanner.Scan() { 113 | _, err := 
r.InsertExampleFromScanner(scanner) 114 | if err != nil { 115 | return err 116 | } 117 | } 118 | if err := scanner.Err(); err != nil { 119 | return err 120 | } 121 | return nil 122 | } 123 | 124 | func (r *repository) searchExamples(query string, args ...interface{}) (model.Examples, error) { 125 | examples := model.Examples{} 126 | err := r.db.Select(&examples, query, args...) 127 | if err != nil { 128 | return nil, err 129 | } 130 | return examples, nil 131 | } 132 | 133 | func (r *repository) findExample(query string, args ...interface{}) (*model.Example, error) { 134 | e := model.Example{} 135 | 136 | err := r.db.Get(&e, query, args...) 137 | if err != nil { 138 | if err == sql.ErrNoRows { 139 | return nil, exampleNotFoundError 140 | } 141 | return nil, err 142 | } 143 | return &e, nil 144 | } 145 | 146 | func (r *repository) SearchExamples() (model.Examples, error) { 147 | query := `SELECT * FROM example;` 148 | return r.searchExamples(query) 149 | } 150 | 151 | func (r *repository) SearchRecentExamples(from time.Time, limit int) (model.Examples, error) { 152 | query := `SELECT * FROM example WHERE created_at > $1 ORDER BY updated_at DESC LIMIT $2;` 153 | return r.searchExamples(query, from, limit) 154 | } 155 | 156 | func (r *repository) SearchRecentExamplesByHost(host string, from time.Time, limit int) (model.Examples, error) { 157 | query := `SELECT * FROM example WHERE final_url like $1 || '%' AND created_at > $2 ORDER BY updated_at DESC LIMIT $3;` 158 | return r.searchExamples(query, host, from, limit) 159 | } 160 | 161 | func (r *repository) SearchExamplesByLabel(label model.LabelType, limit int) (model.Examples, error) { 162 | query := `SELECT * FROM example WHERE label = $1 ORDER BY updated_at DESC LIMIT $2;` 163 | return r.searchExamples(query, label, limit) 164 | } 165 | 166 | func (r *repository) SearchLabeledExamples(limit int) (model.Examples, error) { 167 | query := `SELECT * FROM example WHERE label != 0 ORDER BY updated_at DESC LIMIT $1;` 168 | 
// buildSelectQuery returns the "SELECT <columns>" prefix used for example
// queries; callers append the FROM clause themselves.
//
// Fields such as body can become extremely long, so when useTruncatedField
// is true the database truncates them before returning: title to 200
// characters, and description, og_description and body to 1000 characters.
func buildSelectQuery(useTruncatedField bool) string {
	const columns = "SELECT id, label, url, final_url, %s, %s, %s, og_type, og_image, %s, score, is_new, status_code, favicon, error_count, created_at, updated_at"

	if !useTruncatedField {
		return fmt.Sprintf(columns, "title", "description", "og_description", "body")
	}
	return fmt.Sprintf(
		columns,
		"LEFT(title, 200) AS title",
		"LEFT(description, 1000) AS description",
		"LEFT(og_description, 1000) AS og_description",
		"LEFT(body, 1000) AS body",
	)
}
https://godoc.org/github.com/lib/pq#Array 216 | query := `SELECT * FROM example WHERE url = ANY($1);` 217 | return r.searchExamples(query, pq.Array(urls)) 218 | } 219 | 220 | func (r *repository) SearchExamplesByIds(ids []int) (model.Examples, error) { 221 | if len(ids) == 0 { 222 | return model.Examples{}, nil 223 | } 224 | query := fmt.Sprintf(`%s FROM example WHERE id = ANY($1);`, buildSelectQuery(true)) 225 | return r.searchExamples(query, pq.Array(ids)) 226 | } 227 | 228 | func (r *repository) SearchExamplesByKeywords(keywords []string, aggregator string, limit int) (model.Examples, error) { 229 | if len(keywords) == 0 { 230 | return model.Examples{}, nil 231 | } 232 | regexList := make([]string, 0) 233 | for _, w := range keywords { 234 | regexList = append(regexList, fmt.Sprintf(`.*%s.*`, w)) 235 | } 236 | query := fmt.Sprintf(`%s FROM example WHERE title ~* %s($1) AND label != -1 ORDER BY (label, score) DESC LIMIT $2;`, buildSelectQuery(true), aggregator) 237 | return r.searchExamples(query, pq.Array(regexList), limit) 238 | } 239 | 240 | func (r *repository) countExamplesByLabel(label model.LabelType) (int, error) { 241 | cnt := 0 242 | err := r.db.Get(&cnt, `SELECT COUNT(*) FROM example WHERE label = $1`, label) 243 | if err != nil { 244 | return 0, err 245 | } 246 | return cnt, nil 247 | } 248 | 249 | func (r *repository) CountPositiveExamples() (int, error) { 250 | return r.countExamplesByLabel(model.POSITIVE) 251 | } 252 | 253 | func (r *repository) CountNegativeExamples() (int, error) { 254 | return r.countExamplesByLabel(model.NEGATIVE) 255 | } 256 | 257 | func (r *repository) CountUnlabeledExamples() (int, error) { 258 | return r.countExamplesByLabel(model.UNLABELED) 259 | } 260 | 261 | func (r *repository) FindFeatureVector(e *model.Example) (feature.FeatureVector, error) { 262 | fv := feature.FeatureVector{} 263 | tmp, err := r.FindExampleByUlr(e.Url) 264 | if err != nil { 265 | return fv, err 266 | } 267 | id := tmp.Id 268 | query := `SELECT 
feature FROM feature WHERE example_id = $1;` 269 | err = r.db.Select(&fv, query, id) 270 | if err != nil { 271 | return fv, err 272 | } 273 | return fv, nil 274 | } 275 | 276 | func (r *repository) SearchFeatureVector(examples model.Examples) (map[int]feature.FeatureVector, error) { 277 | type Pair struct { 278 | ExampleId int `db:"example_id"` 279 | Feature string `db:"feature"` 280 | } 281 | 282 | fvById := make(map[int]feature.FeatureVector) 283 | urls := make([]string, 0) 284 | for _, e := range examples { 285 | urls = append(urls, e.Url) 286 | } 287 | 288 | tmp, err := r.SearchExamplesByUlrs(urls) 289 | if err != nil { 290 | return fvById, err 291 | } 292 | ids := make([]int, 0) 293 | for _, e := range tmp { 294 | ids = append(ids, e.Id) 295 | } 296 | 297 | query := `SELECT example_id, feature FROM feature WHERE example_id = ANY($1);` 298 | pairs := make([]Pair, 0) 299 | err = r.db.Select(&pairs, query, pq.Array(ids)) 300 | if err != nil { 301 | return fvById, err 302 | } 303 | 304 | for _, pair := range pairs { 305 | fvById[pair.ExampleId] = append(fvById[pair.ExampleId], pair.Feature) 306 | } 307 | return fvById, nil 308 | } 309 | 310 | func (r *repository) DeleteAllExamples() error { 311 | _, err := r.db.Exec(`DELETE FROM example;`) 312 | return err 313 | } 314 | -------------------------------------------------------------------------------- /lib/repository/example_test.go: -------------------------------------------------------------------------------- 1 | package repository_test 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "testing" 7 | "time" 8 | 9 | "github.com/syou6162/go-active-learning/lib/example" 10 | "github.com/syou6162/go-active-learning/lib/feature" 11 | "github.com/syou6162/go-active-learning/lib/model" 12 | "github.com/syou6162/go-active-learning/lib/repository" 13 | ) 14 | 15 | func TestMain(m *testing.M) { 16 | repo, err := repository.New() 17 | if err != nil { 18 | log.Fatal(err.Error()) 19 | } 20 | defer repo.Close() 21 | 22 | ret := m.Run() 
23 | os.Exit(ret) 24 | } 25 | 26 | func TestPing(t *testing.T) { 27 | repo, err := repository.New() 28 | if err != nil { 29 | t.Errorf(err.Error()) 30 | } 31 | defer repo.Close() 32 | 33 | if err := repo.Ping(); err != nil { 34 | t.Errorf(err.Error()) 35 | } 36 | } 37 | 38 | func TestInsertExamplesFromReader(t *testing.T) { 39 | repo, err := repository.New() 40 | if err != nil { 41 | t.Errorf(err.Error()) 42 | } 43 | defer repo.Close() 44 | 45 | if err = repo.DeleteAllExamples(); err != nil { 46 | t.Error(err) 47 | } 48 | 49 | fp, err := os.Open("../../tech_input_example.txt") 50 | defer fp.Close() 51 | if err != nil { 52 | t.Error(err) 53 | } 54 | repo.InsertExamplesFromReader(fp) 55 | 56 | examples, err := repo.SearchExamples() 57 | if err != nil { 58 | t.Error(err) 59 | } 60 | if len(examples) == 0 { 61 | t.Errorf("len(examples) > 0, but %d", len(examples)) 62 | } 63 | } 64 | 65 | func TestInsertOrUpdateExample(t *testing.T) { 66 | repo, err := repository.New() 67 | if err != nil { 68 | t.Errorf(err.Error()) 69 | } 70 | defer repo.Close() 71 | 72 | if err = repo.DeleteAllExamples(); err != nil { 73 | t.Error(err) 74 | } 75 | 76 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge.com", model.UNLABELED)) 77 | if err != nil { 78 | t.Error(err) 79 | } 80 | 81 | examples, err := repo.SearchExamples() 82 | if err != nil { 83 | t.Error(err) 84 | } 85 | if len(examples) != 1 { 86 | t.Errorf("len(examples) == %d, want 1", len(examples)) 87 | } 88 | if examples[0].Label != model.UNLABELED { 89 | t.Errorf("label == %d, want 0", examples[0].Label) 90 | } 91 | if examples[0].Id == 0 { 92 | t.Error("id must not be 0") 93 | } 94 | 95 | // same url 96 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge.com", model.NEGATIVE)) 97 | if err != nil { 98 | t.Error(err) 99 | } 100 | 101 | examples, err = repo.SearchExamples() 102 | if err != nil { 103 | t.Error(err) 104 | } 105 | if len(examples) != 1 { 106 | t.Errorf("len(examples) == %d, want 1", 
len(examples)) 107 | } 108 | if examples[0].Label != model.NEGATIVE { 109 | t.Errorf("label == %d, want -1", examples[0].Label) 110 | } 111 | 112 | // same url but different label 113 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge.com", model.POSITIVE)) 114 | if err != nil { 115 | t.Error(err) 116 | } 117 | 118 | examples, err = repo.SearchExamples() 119 | if err != nil { 120 | t.Error(err) 121 | } 122 | if len(examples) != 1 { 123 | t.Errorf("len(examples) == %d, want 1", len(examples)) 124 | } 125 | if examples[0].Label != model.POSITIVE { 126 | t.Errorf("label == %d, want 1", examples[0].Label) 127 | } 128 | 129 | // cannot update to unlabeled 130 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge.com", model.UNLABELED)) 131 | if err != nil { 132 | t.Error(err) 133 | } 134 | 135 | examples, err = repo.SearchExamples() 136 | if err != nil { 137 | t.Error(err) 138 | } 139 | if len(examples) != 1 { 140 | t.Errorf("len(examples) == %d, want 1", len(examples)) 141 | } 142 | if examples[0].Label != model.POSITIVE { 143 | t.Errorf("label == %d, want 1", examples[0].Label) 144 | } 145 | 146 | // different url 147 | err = repo.UpdateOrCreateExample(example.NewExample("http://another.com", model.NEGATIVE)) 148 | if err != nil { 149 | t.Error(err) 150 | } 151 | 152 | examples, err = repo.SearchExamples() 153 | if err != nil { 154 | t.Error(err) 155 | } 156 | if len(examples) != 2 { 157 | t.Errorf("len(examples) == %d, want 2", len(examples)) 158 | } 159 | } 160 | 161 | func TestUpdateScore(t *testing.T) { 162 | repo, err := repository.New() 163 | if err != nil { 164 | t.Errorf(err.Error()) 165 | } 166 | defer repo.Close() 167 | 168 | if err = repo.DeleteAllExamples(); err != nil { 169 | t.Error(err) 170 | } 171 | 172 | url := "http://hoge.com" 173 | e := example.NewExample(url, model.UNLABELED) 174 | e.Score = 1.0 175 | err = repo.UpdateOrCreateExample(e) 176 | if err != nil { 177 | t.Error(err) 178 | } 179 | 180 | e, err = 
repo.FindExampleByUlr(url) 181 | if err != nil { 182 | t.Error(err) 183 | } 184 | if e.Score != 1.0 { 185 | t.Errorf("e.Score == %f, want 1.0", e.Score) 186 | } 187 | 188 | e.Score = 100.0 189 | err = repo.UpdateScore(e) 190 | if err != nil { 191 | t.Error(err) 192 | } 193 | 194 | e, err = repo.FindExampleByUlr(url) 195 | if err != nil { 196 | t.Error(err) 197 | } 198 | if e.Score != 100.0 { 199 | t.Errorf("e.Score == %f, want 100.0", e.Score) 200 | } 201 | } 202 | 203 | func TestErrorCount(t *testing.T) { 204 | repo, err := repository.New() 205 | if err != nil { 206 | t.Errorf(err.Error()) 207 | } 208 | defer repo.Close() 209 | 210 | if err = repo.DeleteAllExamples(); err != nil { 211 | t.Error(err) 212 | } 213 | 214 | existingUrl := example.NewExample("https://github.com", model.POSITIVE) 215 | nonExistingUrl := example.NewExample("http://hoge.fuga", model.NEGATIVE) 216 | examples := model.Examples{existingUrl, nonExistingUrl} 217 | 218 | for _, e := range examples { 219 | if err := repo.UpdateOrCreateExample(e); err != nil { 220 | t.Error(err) 221 | } 222 | 223 | cnt, err := repo.GetErrorCount(e) 224 | if err != nil { 225 | t.Errorf("Cannot get error count: %s", err.Error()) 226 | } 227 | if cnt != 0 { 228 | t.Errorf("Error count must be 0 for %s", e.Url) 229 | } 230 | } 231 | 232 | for _, e := range examples { 233 | err := repo.IncErrorCount(e) 234 | if err != nil { 235 | t.Errorf("Cannot get error count: %s", err.Error()) 236 | } 237 | } 238 | 239 | for _, e := range examples { 240 | cnt, err := repo.GetErrorCount(e) 241 | if err != nil { 242 | t.Errorf("Cannot get error count: %s", err.Error()) 243 | } 244 | if cnt != 1 { 245 | t.Errorf("Error count must be 1 for %s", e.Url) 246 | } 247 | } 248 | } 249 | 250 | func TestReadLabeledExamples(t *testing.T) { 251 | repo, err := repository.New() 252 | if err != nil { 253 | t.Errorf(err.Error()) 254 | } 255 | defer repo.Close() 256 | 257 | if err = repo.DeleteAllExamples(); err != nil { 258 | t.Error(err) 259 | } 
260 | 261 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE)) 262 | if err != nil { 263 | t.Error(err) 264 | } 265 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE)) 266 | if err != nil { 267 | t.Error(err) 268 | } 269 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED)) 270 | if err != nil { 271 | t.Error(err) 272 | } 273 | 274 | examples, err := repo.SearchLabeledExamples(10) 275 | if err != nil { 276 | t.Error(err) 277 | } 278 | if len(examples) != 2 { 279 | t.Errorf("len(examples) == %d, want 2", len(examples)) 280 | } 281 | } 282 | 283 | func TestReadRecentExamples(t *testing.T) { 284 | repo, err := repository.New() 285 | if err != nil { 286 | t.Errorf(err.Error()) 287 | } 288 | defer repo.Close() 289 | 290 | if err = repo.DeleteAllExamples(); err != nil { 291 | t.Error(err) 292 | } 293 | 294 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE)) 295 | if err != nil { 296 | t.Error(err) 297 | } 298 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE)) 299 | if err != nil { 300 | t.Error(err) 301 | } 302 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED)) 303 | if err != nil { 304 | t.Error(err) 305 | } 306 | 307 | examples, err := repo.SearchRecentExamples(time.Now().Add(time.Duration(-10)*time.Minute), 10) 308 | if err != nil { 309 | t.Error(err) 310 | } 311 | if len(examples) != 3 { 312 | t.Errorf("len(examples) == %d, want 3", len(examples)) 313 | } 314 | } 315 | 316 | func TestReadRecentExamplesByHost(t *testing.T) { 317 | repo, err := repository.New() 318 | if err != nil { 319 | t.Errorf(err.Error()) 320 | } 321 | defer repo.Close() 322 | 323 | if err = repo.DeleteAllExamples(); err != nil { 324 | t.Error(err) 325 | } 326 | 327 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE)) 328 | if 
err != nil { 329 | t.Error(err) 330 | } 331 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE)) 332 | if err != nil { 333 | t.Error(err) 334 | } 335 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED)) 336 | if err != nil { 337 | t.Error(err) 338 | } 339 | 340 | examples, err := repo.SearchRecentExamplesByHost("http://hoge1.com", time.Now().Add(time.Duration(-10)*time.Minute), 10) 341 | if err != nil { 342 | t.Error(err) 343 | } 344 | if len(examples) != 1 { 345 | t.Errorf("len(examples) == %d, want 1", len(examples)) 346 | } 347 | } 348 | 349 | func TestSearchExamplesByUlr(t *testing.T) { 350 | repo, err := repository.New() 351 | if err != nil { 352 | t.Errorf(err.Error()) 353 | } 354 | defer repo.Close() 355 | 356 | if err = repo.DeleteAllExamples(); err != nil { 357 | t.Error(err) 358 | } 359 | 360 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.NEGATIVE)) 361 | if err != nil { 362 | t.Error(err) 363 | } 364 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE)) 365 | if err != nil { 366 | t.Error(err) 367 | } 368 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED)) 369 | if err != nil { 370 | t.Error(err) 371 | } 372 | 373 | example, err := repo.FindExampleByUlr("http://hoge1.com") 374 | if err != nil { 375 | t.Error(err) 376 | } 377 | if example.Url == "" { 378 | t.Errorf("example.Url == %s, want http://hoge1.com", example.Url) 379 | } 380 | 381 | example, err = repo.FindExampleByUlr("http://hoge4.com") 382 | if err == nil { 383 | t.Errorf("search result must be nil") 384 | } 385 | } 386 | 387 | func TestSearchExamplesByUlrs(t *testing.T) { 388 | repo, err := repository.New() 389 | if err != nil { 390 | t.Errorf(err.Error()) 391 | } 392 | defer repo.Close() 393 | 394 | if err = repo.DeleteAllExamples(); err != nil { 395 | t.Error(err) 396 | } 397 | 398 | err = 
repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.NEGATIVE)) 399 | if err != nil { 400 | t.Error(err) 401 | } 402 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE)) 403 | if err != nil { 404 | t.Error(err) 405 | } 406 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED)) 407 | if err != nil { 408 | t.Error(err) 409 | } 410 | 411 | examples, err := repo.SearchExamplesByUlrs([]string{"http://hoge1.com", "http://hoge2.com"}) 412 | if err != nil { 413 | t.Error(err) 414 | } 415 | if len(examples) != 2 { 416 | t.Errorf("len(examples) == %d, want 2", len(examples)) 417 | } 418 | } 419 | 420 | func TestSearchExamplesByLabels(t *testing.T) { 421 | repo, err := repository.New() 422 | if err != nil { 423 | t.Errorf(err.Error()) 424 | } 425 | defer repo.Close() 426 | 427 | if err = repo.DeleteAllExamples(); err != nil { 428 | t.Error(err) 429 | } 430 | 431 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE)) 432 | if err != nil { 433 | t.Error(err) 434 | } 435 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE)) 436 | if err != nil { 437 | t.Error(err) 438 | } 439 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED)) 440 | if err != nil { 441 | t.Error(err) 442 | } 443 | 444 | examples, err := repo.SearchPositiveExamples(10) 445 | if err != nil { 446 | t.Error(err) 447 | } 448 | if len(examples) != 1 { 449 | t.Errorf("len(examples) == %d, want 1", len(examples)) 450 | } 451 | 452 | examples, err = repo.SearchNegativeExamples(10) 453 | if err != nil { 454 | t.Error(err) 455 | } 456 | if len(examples) != 1 { 457 | t.Errorf("len(examples) == %d, want 1", len(examples)) 458 | } 459 | 460 | examples, err = repo.SearchUnlabeledExamples(10) 461 | if err != nil { 462 | t.Error(err) 463 | } 464 | if len(examples) != 1 { 465 | t.Errorf("len(examples) == %d, want 1", 
len(examples)) 466 | } 467 | } 468 | 469 | func TestCountExamplesByLabels(t *testing.T) { 470 | repo, err := repository.New() 471 | if err != nil { 472 | t.Errorf(err.Error()) 473 | } 474 | defer repo.Close() 475 | 476 | if err = repo.DeleteAllExamples(); err != nil { 477 | t.Error(err) 478 | } 479 | 480 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE)) 481 | if err != nil { 482 | t.Error(err) 483 | } 484 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE)) 485 | if err != nil { 486 | t.Error(err) 487 | } 488 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED)) 489 | if err != nil { 490 | t.Error(err) 491 | } 492 | 493 | cnt, err := repo.CountPositiveExamples() 494 | if err != nil { 495 | t.Error(err) 496 | } 497 | if cnt != 1 { 498 | t.Errorf("len(posExamples) == %d, want 1", cnt) 499 | } 500 | 501 | cnt, err = repo.CountNegativeExamples() 502 | if err != nil { 503 | t.Error(err) 504 | } 505 | if cnt != 1 { 506 | t.Errorf("len(negExamples) == %d, want 1", cnt) 507 | } 508 | 509 | cnt, err = repo.CountUnlabeledExamples() 510 | if err != nil { 511 | t.Error(err) 512 | } 513 | if cnt != 1 { 514 | t.Errorf("len(unlabeledExamples) == %d, want 1", cnt) 515 | } 516 | } 517 | 518 | func TestFeatureVectorReadWrite(t *testing.T) { 519 | repo, err := repository.New() 520 | if err != nil { 521 | t.Errorf(err.Error()) 522 | } 523 | defer repo.Close() 524 | 525 | if err = repo.DeleteAllExamples(); err != nil { 526 | t.Error(err) 527 | } 528 | 529 | e1 := example.NewExample("http://hoge.com", model.UNLABELED) 530 | err = repo.UpdateOrCreateExample(e1) 531 | if err != nil { 532 | t.Error(err) 533 | } 534 | e1.Fv = feature.FeatureVector{"BIAS"} 535 | 536 | if err = repo.UpdateFeatureVector(e1); err != nil { 537 | t.Error(err) 538 | } 539 | 540 | fv, err := repo.FindFeatureVector(e1) 541 | if err != nil { 542 | t.Error(err) 543 | } 544 | if len(fv) != 1 { 545 | 
t.Errorf("len(fv) == %d, want 1", len(fv)) 546 | } 547 | 548 | e2 := example.NewExample("http://fuga.com", model.UNLABELED) 549 | err = repo.UpdateOrCreateExample(e2) 550 | if err != nil { 551 | t.Error(err) 552 | } 553 | e2.Fv = feature.FeatureVector{"hoge"} 554 | if err = repo.UpdateFeatureVector(e2); err != nil { 555 | t.Error(err) 556 | } 557 | fvList, err := repo.SearchFeatureVector(model.Examples{e1, e2}) 558 | if err != nil { 559 | t.Error(err) 560 | } 561 | if len(fvList) != 2 { 562 | t.Errorf("len(fvList) == %d, want 2", len(fvList)) 563 | } 564 | if fvList[e2.Id][0] != "hoge" { 565 | t.Errorf("fvList[e2.Id][0] == %s, want hoge", fvList[e2.Id][0]) 566 | } 567 | } 568 | 569 | func TestSearchExamplesByWords(t *testing.T) { 570 | repo, err := repository.New() 571 | if err != nil { 572 | t.Errorf(err.Error()) 573 | } 574 | defer repo.Close() 575 | 576 | if err = repo.DeleteAllExamples(); err != nil { 577 | t.Error(err) 578 | } 579 | 580 | e1 := example.NewExample("http://hoge.com", model.UNLABELED) 581 | e1.Title = "日本語" 582 | err = repo.UpdateOrCreateExample(e1) 583 | if err != nil { 584 | t.Error(err) 585 | } 586 | 587 | e2 := example.NewExample("http://fuga.com", model.UNLABELED) 588 | e2.Title = "英語" 589 | err = repo.UpdateOrCreateExample(e2) 590 | if err != nil { 591 | t.Error(err) 592 | } 593 | 594 | examples, err := repo.SearchExamplesByKeywords([]string{"日本語"}, "ALL", 100) 595 | if len(examples) != 1 { 596 | t.Errorf("len(examples) == %d, want 1", len(examples)) 597 | } 598 | examples, err = repo.SearchExamplesByKeywords([]string{"語"}, "ALL", 100) 599 | if len(examples) != 2 { 600 | t.Errorf("len(examples) == %d, want 2", len(examples)) 601 | } 602 | examples, err = repo.SearchExamplesByKeywords([]string{"日本語", "英語"}, "ALL", 100) 603 | if len(examples) != 0 { 604 | t.Errorf("len(examples) == %d, want 0", len(examples)) 605 | } 606 | examples, err = repo.SearchExamplesByKeywords([]string{"日本語", "英語"}, "ANY", 100) 607 | if len(examples) != 2 { 608 | 
t.Errorf("len(examples) == %d, want 2", len(examples)) 609 | } 610 | } 611 | -------------------------------------------------------------------------------- /lib/repository/hatena_bookmark.go: -------------------------------------------------------------------------------- 1 | package repository 2 | 3 | import ( 4 | "github.com/lib/pq" 5 | "github.com/syou6162/go-active-learning/lib/model" 6 | ) 7 | 8 | var hatenaBookmarkNotFoundError = model.NotFoundError("hatenaBookmark") 9 | 10 | func (r *repository) UpdateHatenaBookmark(e *model.Example) error { 11 | if e.HatenaBookmark == nil || e.HatenaBookmark.Count == 0 { 12 | return nil 13 | } 14 | 15 | tmp, err := r.FindExampleByUlr(e.Url) 16 | if err != nil { 17 | return err 18 | } 19 | id := tmp.Id 20 | 21 | e.HatenaBookmark.ExampleId = id 22 | if _, err = r.db.NamedExec(` 23 | INSERT INTO hatena_bookmark 24 | ( example_id, title, screenshot, entry_url, count, url, eid) 25 | VALUES 26 | (:example_id, :title, :screenshot, :entry_url, :count, :url, :eid) 27 | ON CONFLICT (example_id) 28 | DO UPDATE SET 29 | title = :title, count = :count 30 | ;`, e.HatenaBookmark); err != nil { 31 | return err 32 | } 33 | 34 | hb := model.HatenaBookmark{} 35 | if err = r.db.Get(&hb, `SELECT id FROM hatena_bookmark WHERE example_id = $1;`, id); err != nil { 36 | return err 37 | } 38 | 39 | for _, b := range e.HatenaBookmark.Bookmarks { 40 | b.HatenaBookmarkId = hb.Id 41 | if _, err = r.db.NamedExec(` 42 | INSERT INTO bookmark 43 | (hatena_bookmark_id, "user", comment, timestamp, tags) 44 | VALUES 45 | (:hatena_bookmark_id, :user, :comment, :timestamp, :tags) 46 | ON CONFLICT (hatena_bookmark_id, "user") DO NOTHING 47 | ;`, b); err != nil { 48 | return err 49 | } 50 | } 51 | return nil 52 | } 53 | 54 | func (r *repository) SearchHatenaBookmarks(examples model.Examples, limitForEachExample int) ([]*model.HatenaBookmark, error) { 55 | hatenaBookmarks := make([]*model.HatenaBookmark, 0) 56 | exampleIds := make([]int, 0) 57 | for _, e := range 
examples { 58 | exampleIds = append(exampleIds, e.Id) 59 | } 60 | 61 | query := `SELECT * FROM hatena_bookmark WHERE example_id = ANY($1);` 62 | err := r.db.Select(&hatenaBookmarks, query, pq.Array(exampleIds)) 63 | if err != nil { 64 | return hatenaBookmarks, err 65 | } 66 | 67 | hatenaBookmarkIds := make([]int, 0) 68 | for _, hb := range hatenaBookmarks { 69 | hatenaBookmarkIds = append(hatenaBookmarkIds, hb.Id) 70 | hb.Bookmarks = make([]*model.Bookmark, 0) 71 | } 72 | if limitForEachExample == 0 { 73 | return hatenaBookmarks, nil 74 | } 75 | 76 | bookmarks := make([]*model.Bookmark, 0) 77 | query = `SELECT * FROM bookmark WHERE hatena_bookmark_id = ANY($1) ORDER BY timestamp LIMIT $2;` 78 | err = r.db.Select(&bookmarks, query, pq.Array(hatenaBookmarkIds), limitForEachExample) 79 | if err != nil { 80 | return hatenaBookmarks, err 81 | } 82 | 83 | bookmarksByHatenaBookmarkId := make(map[int][]*model.Bookmark) 84 | for _, b := range bookmarks { 85 | bookmarksByHatenaBookmarkId[b.HatenaBookmarkId] = append(bookmarksByHatenaBookmarkId[b.HatenaBookmarkId], b) 86 | } 87 | 88 | result := make([]*model.HatenaBookmark, 0) 89 | for _, hb := range hatenaBookmarks { 90 | bookmarks := bookmarksByHatenaBookmarkId[hb.Id] 91 | hb.Bookmarks = bookmarks 92 | result = append(result, hb) 93 | } 94 | return result, nil 95 | } 96 | 97 | func (r *repository) FindHatenaBookmark(e *model.Example, limit int) (*model.HatenaBookmark, error) { 98 | hatenaBookmark := &model.HatenaBookmark{} 99 | 100 | query := `SELECT * FROM hatena_bookmark WHERE example_id = $1;` 101 | err := r.db.Get(hatenaBookmark, query, e.Id) 102 | if err != nil { 103 | return hatenaBookmark, err 104 | } 105 | 106 | bookmarks := make([]*model.Bookmark, 0) 107 | if limit == 0 { 108 | hatenaBookmark.Bookmarks = bookmarks 109 | return hatenaBookmark, nil 110 | } 111 | 112 | hatenaBookmarkId := hatenaBookmark.Id 113 | query = `SELECT * FROM bookmark WHERE hatena_bookmark_id = $1 ORDER BY timestamp LIMIT $2;` 114 | err = 
r.db.Select(&bookmarks, query, hatenaBookmarkId, limit) 115 | if err != nil { 116 | return hatenaBookmark, err 117 | } 118 | 119 | hatenaBookmark.Bookmarks = bookmarks 120 | return hatenaBookmark, nil 121 | } 122 | -------------------------------------------------------------------------------- /lib/repository/hatena_bookmark_test.go: -------------------------------------------------------------------------------- 1 | package repository_test 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/syou6162/go-active-learning/lib/example" 8 | "github.com/syou6162/go-active-learning/lib/model" 9 | "github.com/syou6162/go-active-learning/lib/repository" 10 | ) 11 | 12 | func TestUpdateHatenaBookmark(t *testing.T) { 13 | repo, err := repository.New() 14 | if err != nil { 15 | t.Errorf(err.Error()) 16 | } 17 | defer repo.Close() 18 | 19 | if err = repo.DeleteAllExamples(); err != nil { 20 | t.Error(err) 21 | } 22 | 23 | e := example.NewExample("http://hoge.com", model.UNLABELED) 24 | err = repo.UpdateOrCreateExample(e) 25 | if err != nil { 26 | t.Error(err) 27 | } 28 | now := time.Now() 29 | b1 := model.Bookmark{ 30 | User: "syou6162", 31 | Comment: "面白いサイトですね", 32 | Timestamp: model.HatenaBookmarkTime{Time: &now}, 33 | Tags: model.Tags{"hack"}, 34 | } 35 | hb := model.HatenaBookmark{ 36 | ExampleId: e.Id, 37 | Title: "hoge", 38 | Count: 10, 39 | Bookmarks: []*model.Bookmark{&b1}, 40 | } 41 | e.HatenaBookmark = &hb 42 | if err = repo.UpdateHatenaBookmark(e); err != nil { 43 | t.Error(err) 44 | } 45 | 46 | { 47 | result, err := repo.SearchHatenaBookmarks(model.Examples{e}, 10) 48 | if err != nil { 49 | t.Error(err) 50 | } 51 | 52 | for _, tmp := range result { 53 | if tmp.Title == "" { 54 | t.Error("Title must not be empty") 55 | } 56 | for _, b := range tmp.Bookmarks { 57 | if b.User == "" { 58 | t.Error("User must not be empty") 59 | } 60 | if len(b.Tags) == 0 { 61 | t.Error("Tags must not be empty") 62 | } 63 | } 64 | } 65 | } 66 | 67 | { 68 | result, err := 
repo.FindHatenaBookmark(e, 10) 69 | if err != nil { 70 | t.Error(err) 71 | } 72 | 73 | if result.Title == "" { 74 | t.Error("Title must not be empty") 75 | } 76 | for _, b := range result.Bookmarks { 77 | if b.User == "" { 78 | t.Error("User must not be empty") 79 | } 80 | if len(b.Tags) == 0 { 81 | t.Error("Tags must not be empty") 82 | } 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /lib/repository/mira.go: -------------------------------------------------------------------------------- 1 | package repository 2 | 3 | import ( 4 | "encoding/json" 5 | 6 | "github.com/syou6162/go-active-learning/lib/classifier" 7 | ) 8 | 9 | func (r *repository) InsertMIRAModel(m classifier.MIRAClassifier) error { 10 | bytes, err := json.Marshal(m) 11 | if err != nil { 12 | return err 13 | } 14 | query := `INSERT INTO model (model_type, model, c, accuracy, precision, recall, fvalue) VALUES ($1, $2, $3, $4, $5, $6, $7);` 15 | if _, err := r.db.Exec(query, m.ModelType, string(bytes), m.C, m.Accuracy, m.Precision, m.Recall, m.Fvalue); err != nil { 16 | return err 17 | } 18 | return nil 19 | } 20 | 21 | func (r *repository) FindLatestMIRAModel(modelType classifier.ModelType) (*classifier.MIRAClassifier, error) { 22 | type Classifier struct { 23 | Model string 24 | } 25 | tmp := Classifier{} 26 | 27 | query := `SELECT model FROM model WHERE model_type = $1 ORDER BY created_at DESC LIMIT 1;` 28 | err := r.db.Get(&tmp, query, modelType) 29 | if err != nil { 30 | return nil, err 31 | } 32 | 33 | clf := classifier.MIRAClassifier{} 34 | if err := json.Unmarshal(([]byte)(tmp.Model), &clf); err != nil { 35 | return nil, err 36 | } 37 | return &clf, nil 38 | } 39 | -------------------------------------------------------------------------------- /lib/repository/mira_test.go: -------------------------------------------------------------------------------- 1 | package repository_test 2 | 3 | import ( 4 | "testing" 5 | 6 | 
"github.com/syou6162/go-active-learning/lib/classifier" 7 | "github.com/syou6162/go-active-learning/lib/repository" 8 | ) 9 | 10 | func TestInsertMIRAModel(t *testing.T) { 11 | repo, err := repository.New() 12 | if err != nil { 13 | t.Errorf(err.Error()) 14 | } 15 | defer repo.Close() 16 | 17 | weight := make(map[string]float64) 18 | weight["hoge"] = 1.0 19 | weight["fuga"] = 1.0 20 | clf := classifier.MIRAClassifier{classifier.EXAMPLE, weight, 10.0, 0.0, 0.0, 0.0, 0.0} 21 | err = repo.InsertMIRAModel(clf) 22 | if err != nil { 23 | t.Error(err) 24 | } 25 | 26 | { 27 | clf, err := repo.FindLatestMIRAModel(classifier.EXAMPLE) 28 | if err != nil { 29 | t.Error(err) 30 | } 31 | if len(clf.Weight) == 0 { 32 | t.Error("weight must not be empty") 33 | } 34 | if clf.C != 10.0 { 35 | t.Error("C must be 10.0") 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /lib/repository/recommendation.go: -------------------------------------------------------------------------------- 1 | package repository 2 | 3 | import ( 4 | "github.com/lib/pq" 5 | "github.com/syou6162/go-active-learning/lib/model" 6 | ) 7 | 8 | func (r *repository) UpdateRecommendation(rec model.Recommendation) error { 9 | if _, err := r.db.Exec(`DELETE FROM recommendation WHERE list_type = $1;`, rec.RecommendationListType); err != nil { 10 | return err 11 | } 12 | if _, err := r.db.Exec(`INSERT INTO recommendation (list_type, example_id) VALUES ($1, unnest(cast($2 AS INT[])));`, rec.RecommendationListType, pq.Array(rec.ExampleIds)); err != nil { 13 | return err 14 | } 15 | return nil 16 | } 17 | 18 | func (r *repository) FindRecommendation(t model.RecommendationListType) (*model.Recommendation, error) { 19 | rec := &model.Recommendation{RecommendationListType: t} 20 | items := make([]int, 0) 21 | query := `SELECT example_id FROM recommendation WHERE list_type = $1;` 22 | err := r.db.Select(&items, query, t) 23 | if err != nil { 24 | return nil, err 25 | } 26 
| rec.ExampleIds = items 27 | return rec, nil 28 | } 29 | -------------------------------------------------------------------------------- /lib/repository/recommendation_test.go: -------------------------------------------------------------------------------- 1 | package repository_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/syou6162/go-active-learning/lib/example" 7 | "github.com/syou6162/go-active-learning/lib/model" 8 | "github.com/syou6162/go-active-learning/lib/repository" 9 | ) 10 | 11 | func TestUpdateRecommendation(t *testing.T) { 12 | repo, err := repository.New() 13 | if err != nil { 14 | t.Errorf(err.Error()) 15 | } 16 | defer repo.Close() 17 | 18 | e1 := example.NewExample("http://hoge1.com", model.POSITIVE) 19 | e2 := example.NewExample("http://hoge2.com", model.NEGATIVE) 20 | e3 := example.NewExample("http://hoge3.com", model.UNLABELED) 21 | examples := model.Examples{e1, e2, e3} 22 | for _, e := range examples { 23 | err = repo.UpdateOrCreateExample(e) 24 | if err != nil { 25 | t.Error(err) 26 | } 27 | } 28 | rec := model.Recommendation{RecommendationListType: model.GENERAL, ExampleIds: []int{e1.Id, e2.Id, e3.Id}} 29 | err = repo.UpdateRecommendation(rec) 30 | if err != nil { 31 | t.Error(err) 32 | } 33 | 34 | { 35 | rec, err := repo.FindRecommendation(model.GENERAL) 36 | if err != nil { 37 | t.Error(err) 38 | } 39 | if len(rec.ExampleIds) != 3 { 40 | t.Error("len(rec.ExampleIds) must be 3") 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /lib/repository/related_example.go: -------------------------------------------------------------------------------- 1 | package repository 2 | 3 | import ( 4 | "github.com/lib/pq" 5 | "github.com/syou6162/go-active-learning/lib/model" 6 | ) 7 | 8 | func (r *repository) UpdateRelatedExamples(related model.RelatedExamples) error { 9 | if _, err := r.db.Exec(`DELETE FROM related_example WHERE example_id = $1;`, related.ExampleId); err != nil { 10 | 
return err 11 | } 12 | if _, err := r.db.Exec(`INSERT INTO related_example (example_id, related_example_id) VALUES ($1, unnest(cast($2 AS INT[])));`, related.ExampleId, pq.Array(related.RelatedExampleIds)); err != nil { 13 | return err 14 | } 15 | return nil 16 | } 17 | 18 | func (r *repository) FindRelatedExamples(e *model.Example) (*model.RelatedExamples, error) { 19 | related := &model.RelatedExamples{ExampleId: e.Id} 20 | items := make([]int, 0) 21 | query := `SELECT related_example_id FROM related_example WHERE example_id = $1;` 22 | err := r.db.Select(&items, query, e.Id) 23 | if err != nil { 24 | return nil, err 25 | } 26 | related.RelatedExampleIds = items 27 | return related, nil 28 | } 29 | -------------------------------------------------------------------------------- /lib/repository/related_example_test.go: -------------------------------------------------------------------------------- 1 | package repository_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/syou6162/go-active-learning/lib/example" 7 | "github.com/syou6162/go-active-learning/lib/model" 8 | "github.com/syou6162/go-active-learning/lib/repository" 9 | ) 10 | 11 | func TestUpdateRelatedExamples(t *testing.T) { 12 | repo, err := repository.New() 13 | if err != nil { 14 | t.Errorf(err.Error()) 15 | } 16 | defer repo.Close() 17 | 18 | e1 := example.NewExample("http://hoge1.com", model.POSITIVE) 19 | e2 := example.NewExample("http://hoge2.com", model.NEGATIVE) 20 | e3 := example.NewExample("http://hoge3.com", model.UNLABELED) 21 | examples := model.Examples{e1, e2, e3} 22 | for _, e := range examples { 23 | err = repo.UpdateOrCreateExample(e) 24 | if err != nil { 25 | t.Error(err) 26 | } 27 | } 28 | related := model.RelatedExamples{ExampleId: e1.Id, RelatedExampleIds: []int{e2.Id, e3.Id}} 29 | err = repo.UpdateRelatedExamples(related) 30 | if err != nil { 31 | t.Error(err) 32 | } 33 | 34 | { 35 | related, err := repo.FindRelatedExamples(e1) 36 | if err != nil { 37 | t.Error(err) 38 | } 
39 | if len(related.RelatedExampleIds) != 2 { 40 | t.Error("len(related.RelatedExampleIds) must be 2") 41 | } 42 | } 43 | { 44 | related, err := repo.FindRelatedExamples(e2) 45 | if err != nil { 46 | t.Error(err) 47 | } 48 | if len(related.RelatedExampleIds) != 0 { 49 | t.Error("len(related.RelatedExampleIds) must be 0") 50 | } 51 | } 52 | } 53 | 54 | func TestUpdateRelatedExamplesMyOwn(t *testing.T) { 55 | repo, err := repository.New() 56 | if err != nil { 57 | t.Errorf(err.Error()) 58 | } 59 | defer repo.Close() 60 | 61 | e1 := example.NewExample("http://hoge1.com", model.POSITIVE) 62 | e2 := example.NewExample("http://hoge2.com", model.NEGATIVE) 63 | e3 := example.NewExample("http://hoge3.com", model.UNLABELED) 64 | examples := model.Examples{e1, e2, e3} 65 | for _, e := range examples { 66 | err = repo.UpdateOrCreateExample(e) 67 | if err != nil { 68 | t.Error(err) 69 | } 70 | } 71 | related := model.RelatedExamples{ExampleId: e1.Id, RelatedExampleIds: []int{e1.Id, e2.Id, e3.Id}} 72 | err = repo.UpdateRelatedExamples(related) 73 | if err == nil { 74 | t.Error("自身と同一のexample_idを持つ事例はrelated_example_idに追加できない") 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /lib/repository/repository.go: -------------------------------------------------------------------------------- 1 | package repository 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "time" 7 | 8 | "github.com/jmoiron/sqlx" 9 | 10 | "bufio" 11 | 12 | _ "github.com/lib/pq" 13 | "github.com/syou6162/go-active-learning/lib/classifier" 14 | "github.com/syou6162/go-active-learning/lib/feature" 15 | "github.com/syou6162/go-active-learning/lib/model" 16 | "github.com/syou6162/go-active-learning/lib/util" 17 | ) 18 | 19 | type Repository interface { 20 | UpdateOrCreateExample(e *model.Example) error 21 | UpdateScore(e *model.Example) error 22 | InsertExampleFromScanner(scanner *bufio.Scanner) (*model.Example, error) 23 | InsertExamplesFromReader(reader io.Reader) error 24 | 
SearchExamples() (model.Examples, error) 25 | SearchRecentExamples(from time.Time, limit int) (model.Examples, error) 26 | SearchRecentExamplesByHost(host string, from time.Time, limit int) (model.Examples, error) 27 | SearchExamplesByLabel(label model.LabelType, limit int) (model.Examples, error) 28 | SearchLabeledExamples(limit int) (model.Examples, error) 29 | SearchPositiveExamples(limit int) (model.Examples, error) 30 | SearchNegativeExamples(limit int) (model.Examples, error) 31 | SearchUnlabeledExamples(limit int) (model.Examples, error) 32 | SearchPositiveScoredExamples(limit int) (model.Examples, error) 33 | FindExampleByUlr(url string) (*model.Example, error) 34 | FindExampleById(id int) (*model.Example, error) 35 | SearchExamplesByUlrs(urls []string) (model.Examples, error) 36 | SearchExamplesByIds(ids []int) (model.Examples, error) 37 | SearchExamplesByKeywords(keywords []string, aggregator string, limit int) (model.Examples, error) 38 | DeleteAllExamples() error 39 | 40 | CountPositiveExamples() (int, error) 41 | CountNegativeExamples() (int, error) 42 | CountUnlabeledExamples() (int, error) 43 | 44 | IncErrorCount(e *model.Example) error 45 | GetErrorCount(e *model.Example) (int, error) 46 | 47 | UpdateFeatureVector(e *model.Example) error 48 | FindFeatureVector(e *model.Example) (feature.FeatureVector, error) 49 | SearchFeatureVector(examples model.Examples) (map[int]feature.FeatureVector, error) 50 | 51 | UpdateHatenaBookmark(e *model.Example) error 52 | SearchHatenaBookmarks(examples model.Examples, limitForEachExample int) ([]*model.HatenaBookmark, error) 53 | FindHatenaBookmark(e *model.Example, limit int) (*model.HatenaBookmark, error) 54 | 55 | UpdateOrCreateReferringTweets(e *model.Example) error 56 | UpdateTweetLabel(exampleId int, idStr string, label model.LabelType) error 57 | SearchReferringTweetsList(examples model.Examples, limit int) (map[int]model.ReferringTweets, error) 58 | SearchReferringTweets(limit int) (model.ReferringTweets, 
error) 59 | SearchPositiveReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) 60 | SearchNegativeReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) 61 | SearchUnlabeledReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) 62 | FindReferringTweets(e *model.Example, limit int) (model.ReferringTweets, error) 63 | SearchRecentReferringTweetsWithHighScore(from time.Time, scoreThreshold float64, limit int) (model.ReferringTweets, error) 64 | 65 | InsertMIRAModel(m classifier.MIRAClassifier) error 66 | FindLatestMIRAModel(modelType classifier.ModelType) (*classifier.MIRAClassifier, error) 67 | 68 | UpdateRecommendation(rec model.Recommendation) error 69 | FindRecommendation(t model.RecommendationListType) (*model.Recommendation, error) 70 | 71 | UpdateRelatedExamples(related model.RelatedExamples) error 72 | FindRelatedExamples(e *model.Example) (*model.RelatedExamples, error) 73 | 74 | UpdateTopAccessedExampleIds([]int) error 75 | SearchTopAccessedExampleIds() ([]int, error) 76 | 77 | Ping() error 78 | Close() error 79 | } 80 | 81 | type repository struct { 82 | db *sqlx.DB 83 | } 84 | 85 | func GetDataSourceName() string { 86 | host := util.GetEnv("POSTGRES_HOST", "localhost") 87 | dbUser := util.GetEnv("DB_USER", "nobody") 88 | dbPassword := util.GetEnv("DB_PASSWORD", "nobody") 89 | dbName := util.GetEnv("DB_NAME", "go-active-learning") 90 | return fmt.Sprintf( 91 | "host=%s user=%s password=%s dbname=%s sslmode=disable", 92 | host, dbUser, dbPassword, dbName, 93 | ) 94 | } 95 | 96 | func New() (*repository, error) { 97 | db, err := sqlx.Open("postgres", GetDataSourceName()) 98 | if err != nil { 99 | return nil, err 100 | } 101 | db.SetMaxOpenConns(50) 102 | return &repository{db: db}, nil 103 | } 104 | 105 | func (r *repository) Ping() error { 106 | return r.db.Ping() 107 | } 108 | 109 | func 
(r *repository) Close() error { 110 | if r.db != nil { 111 | return r.db.Close() 112 | } else { 113 | return nil 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /lib/repository/top_accessed_example.go: -------------------------------------------------------------------------------- 1 | package repository 2 | 3 | import ( 4 | "github.com/lib/pq" 5 | ) 6 | 7 | func (r *repository) UpdateTopAccessedExampleIds(exampleIds []int) error { 8 | if _, err := r.db.Exec(`DELETE FROM top_accessed_example;`); err != nil { 9 | return err 10 | } 11 | if _, err := r.db.Exec(`INSERT INTO top_accessed_example (example_id) VALUES (unnest(cast($1 AS INT[])));`, pq.Array(exampleIds)); err != nil { 12 | return err 13 | } 14 | return nil 15 | } 16 | 17 | func (r *repository) SearchTopAccessedExampleIds() ([]int, error) { 18 | exampleIds := make([]int, 0) 19 | query := `SELECT example_id FROM top_accessed_example;` 20 | err := r.db.Select(&exampleIds, query) 21 | if err != nil { 22 | return nil, err 23 | } 24 | return exampleIds, nil 25 | } 26 | -------------------------------------------------------------------------------- /lib/repository/top_accessed_example_test.go: -------------------------------------------------------------------------------- 1 | package repository_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/syou6162/go-active-learning/lib/example" 7 | "github.com/syou6162/go-active-learning/lib/model" 8 | "github.com/syou6162/go-active-learning/lib/repository" 9 | ) 10 | 11 | func TestUpdateTopAccessedExampleIds(t *testing.T) { 12 | repo, err := repository.New() 13 | if err != nil { 14 | t.Errorf(err.Error()) 15 | } 16 | defer repo.Close() 17 | 18 | e1 := example.NewExample("http://hoge1.com", model.POSITIVE) 19 | e2 := example.NewExample("http://hoge2.com", model.NEGATIVE) 20 | e3 := example.NewExample("http://hoge3.com", model.UNLABELED) 21 | examples := model.Examples{e1, e2, e3} 22 | for _, e := range examples { 
23 | err = repo.UpdateOrCreateExample(e) 24 | if err != nil { 25 | t.Error(err) 26 | } 27 | } 28 | exampleIds := make([]int, 0) 29 | for _, e := range examples { 30 | exampleIds = append(exampleIds, e.Id) 31 | } 32 | err = repo.UpdateTopAccessedExampleIds(exampleIds) 33 | if err != nil { 34 | t.Error(err) 35 | } 36 | 37 | { 38 | top, err := repo.SearchTopAccessedExampleIds() 39 | if err != nil { 40 | t.Error(err) 41 | } 42 | if len(top) != 3 { 43 | t.Error("len(top) must be 3") 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /lib/repository/tweet.go: -------------------------------------------------------------------------------- 1 | package repository 2 | 3 | import ( 4 | "time" 5 | 6 | "github.com/lib/pq" 7 | "github.com/syou6162/go-active-learning/lib/model" 8 | ) 9 | 10 | func (r *repository) UpdateOrCreateReferringTweets(e *model.Example) error { 11 | if e.ReferringTweets == nil || len((*e).ReferringTweets.Tweets) == 0 || (*e).ReferringTweets.Count == 0 { 12 | return nil 13 | } 14 | 15 | tmp, err := r.FindExampleByUlr(e.Url) 16 | if err != nil { 17 | return err 18 | } 19 | id := tmp.Id 20 | 21 | for _, t := range (*e).ReferringTweets.Tweets { 22 | t.ExampleId = id 23 | if _, err = r.db.NamedExec(` 24 | INSERT INTO tweet 25 | ( example_id, created_at, id_str, full_text, favorite_count, retweet_count, lang, screen_name, name, profile_image_url, label, score) 26 | VALUES 27 | (:example_id, :created_at, :id_str, :full_text, :favorite_count, :retweet_count, :lang, :screen_name, :name, :profile_image_url, :label, :score) 28 | ON CONFLICT (example_id, id_str) 29 | DO UPDATE SET 30 | favorite_count = :favorite_count, retweet_count = :retweet_count, label = :label 31 | WHERE 32 | EXCLUDED.label != 0 AND tweet.label != EXCLUDED.label 33 | ;`, t); err != nil { 34 | return err 35 | } 36 | } 37 | return nil 38 | } 39 | 40 | func (r *repository) UpdateTweetLabel(exampleId int, idStr string, label model.LabelType) 
error { 41 | if _, err := r.db.Exec(`UPDATE tweet SET label = $1 WHERE example_id = $2 AND id_str = $3;`, label, exampleId, idStr); err != nil { 42 | return err 43 | } 44 | return nil 45 | } 46 | 47 | type exampleIdWithTweetsCount struct { 48 | ExampleId int `db:"example_id"` 49 | TweetsCount int `db:"tweets_count"` 50 | } 51 | 52 | func (r *repository) SearchReferringTweetsList(examples model.Examples, limitForEachExample int) (map[int]model.ReferringTweets, error) { 53 | referringTweetsByExampleId := make(map[int]model.ReferringTweets) 54 | 55 | exampleIds := make([]int, 0) 56 | for _, e := range examples { 57 | exampleIds = append(exampleIds, e.Id) 58 | } 59 | 60 | exampleIdsWithTweetsCount := make([]exampleIdWithTweetsCount, 0) 61 | tweetsCountByExampleQuery := `SELECT example_id, COUNT(*) AS tweets_count FROM tweet WHERE example_id = ANY($1) GROUP BY example_id ORDER BY tweets_count DESC;` 62 | err := r.db.Select(&exampleIdsWithTweetsCount, tweetsCountByExampleQuery, pq.Array(exampleIds)) 63 | if err != nil { 64 | return referringTweetsByExampleId, err 65 | } 66 | tweetsCountByExampleId := make(map[int]int) 67 | for _, e := range exampleIdsWithTweetsCount { 68 | tweetsCountByExampleId[e.ExampleId] = e.TweetsCount 69 | } 70 | 71 | if limitForEachExample == 0 { 72 | for _, exampleId := range exampleIds { 73 | referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)} 74 | if cnt, ok := tweetsCountByExampleId[exampleId]; ok { 75 | referringTweets.Count = cnt 76 | } 77 | referringTweetsByExampleId[exampleId] = referringTweets 78 | } 79 | return referringTweetsByExampleId, nil 80 | } 81 | 82 | tweets := make([]*model.Tweet, 0) 83 | query := `SELECT * FROM tweet WHERE example_id = ANY($1) AND label != -1 AND score > -1.0 AND (lang = 'en' OR lang = 'ja') ORDER BY favorite_count DESC LIMIT $2;` 84 | err = r.db.Select(&tweets, query, pq.Array(exampleIds), limitForEachExample) 85 | if err != nil { 86 | return referringTweetsByExampleId, err 87 | 
} 88 | tweetsByExampleId := make(map[int][]*model.Tweet) 89 | for _, t := range tweets { 90 | tweetsByExampleId[t.ExampleId] = append(tweetsByExampleId[t.ExampleId], t) 91 | } 92 | 93 | for _, exampleId := range exampleIds { 94 | referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)} 95 | if tweets, ok := tweetsByExampleId[exampleId]; ok { 96 | referringTweets.Tweets = tweets 97 | } 98 | if cnt, ok := tweetsCountByExampleId[exampleId]; ok { 99 | referringTweets.Count = cnt 100 | } 101 | referringTweetsByExampleId[exampleId] = referringTweets 102 | } 103 | return referringTweetsByExampleId, nil 104 | } 105 | 106 | func (r *repository) SearchReferringTweets(limit int) (model.ReferringTweets, error) { 107 | referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)} 108 | query := `SELECT * FROM tweet WHERE lang = 'en' OR lang = 'ja' ORDER BY created_at DESC LIMIT $1;` 109 | err := r.db.Select(&referringTweets.Tweets, query, limit) 110 | if err != nil { 111 | return referringTweets, err 112 | } 113 | referringTweets.Count = len(referringTweets.Tweets) 114 | return referringTweets, nil 115 | } 116 | 117 | func (r *repository) SearchRecentReferringTweetsWithHighScore(from time.Time, scoreThreshold float64, limit int) (model.ReferringTweets, error) { 118 | referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)} 119 | query := ` 120 | SELECT 121 | tweet.id, 122 | tweet.example_id, 123 | 124 | tweet.created_at, 125 | tweet.id_str, 126 | tweet.full_text, 127 | tweet.favorite_count, 128 | tweet.retweet_count, 129 | tweet.lang, 130 | 131 | tweet.screen_name, 132 | tweet.name, 133 | tweet.profile_image_url, 134 | tweet.label, 135 | tweet.score 136 | FROM 137 | tweet 138 | INNER JOIN 139 | example ON example.id = example_id 140 | WHERE 141 | tweet.created_at > $1 AND 142 | tweet.label != -1 AND 143 | example.label != -1 AND 144 | tweet.score > $2 AND 145 | (favorite_count > 0 OR retweet_count > 0) 
// searchReferringTweetsByLabel returns tweets carrying the given label, newest
// first (created_at descending), capped at limit rows.
//
// Candidate tweets must have score > scoreThreshold, lang 'en' or 'ja', and
// belong to an example whose label is not -1 and whose updated_at lies within
// the last 30 days. Among the candidates the query keeps
//   - at most tweetsLimitInSameExample tweets per example_id, preferring the
//     ones with the highest favorite_count, and
//   - only the top-ranked row per id_str (again by favorite_count), so the same
//     tweet is not returned twice.
//
// Count is set to the number of tweets returned. On a query error the result
// is returned with Count left at 0 together with the error.
func (r *repository) searchReferringTweetsByLabel(label model.LabelType, scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
	referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)}
	// Placeholders: $1 = label, $2 = scoreThreshold,
	// $3 = tweetsLimitInSameExample, $4 = limit.
	query := `
WITH t AS (
  SELECT
    id,
    ROW_NUMBER() OVER(partition BY example_id ORDER BY favorite_count DESC) AS rank_example_id,
    ROW_NUMBER() OVER(partition BY id_str ORDER BY favorite_count DESC) AS rank_id_str
  FROM
    tweet
  WHERE
    example_id IN (SELECT id FROM example WHERE label != -1 AND updated_at > NOW() - INTERVAL '30 DAYS')
    AND label = $1 AND (lang = 'en' OR lang = 'ja') AND score > $2
)

SELECT
  *
FROM
  tweet
WHERE
  id IN (SELECT id FROM t WHERE rank_example_id <= $3 AND rank_id_str = 1)
ORDER BY
  created_at DESC
LIMIT $4
;`
	err := r.db.Select(&referringTweets.Tweets, query, label, scoreThreshold, tweetsLimitInSameExample, limit)
	if err != nil {
		return referringTweets, err
	}
	// Count reflects the rows actually returned, not the total number of matches.
	referringTweets.Count = len(referringTweets.Tweets)
	return referringTweets, nil
}
error) { 197 | return r.searchReferringTweetsByLabel(model.NEGATIVE, scoreThreshold, tweetsLimitInSameExample, limit) 198 | } 199 | 200 | func (r *repository) SearchUnlabeledReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) { 201 | return r.searchReferringTweetsByLabel(model.UNLABELED, scoreThreshold, tweetsLimitInSameExample, limit) 202 | } 203 | 204 | type tweetsCount struct { 205 | Count int `db:"count"` 206 | } 207 | 208 | func (r *repository) FindReferringTweets(e *model.Example, limit int) (model.ReferringTweets, error) { 209 | referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)} 210 | 211 | countQuery := `SELECT COUNT(*) AS count FROM tweet WHERE example_id = $1;` 212 | cnt := tweetsCount{} 213 | err := r.db.Get(&cnt, countQuery, e.Id) 214 | if err != nil { 215 | return referringTweets, err 216 | } 217 | referringTweets.Count = cnt.Count 218 | if limit == 0 { 219 | return referringTweets, err 220 | } 221 | 222 | query := `SELECT * FROM tweet WHERE example_id = $1 AND label != -1 AND score > 0.0 AND (lang = 'en' OR lang = 'ja') ORDER BY favorite_count DESC LIMIT $2;` 223 | err = r.db.Select(&referringTweets.Tweets, query, e.Id, limit) 224 | if err != nil { 225 | return referringTweets, err 226 | } 227 | return referringTweets, nil 228 | } 229 | -------------------------------------------------------------------------------- /lib/repository/tweet_test.go: -------------------------------------------------------------------------------- 1 | package repository_test 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/syou6162/go-active-learning/lib/example" 8 | "github.com/syou6162/go-active-learning/lib/model" 9 | "github.com/syou6162/go-active-learning/lib/repository" 10 | ) 11 | 12 | func TestUpdateReferringTweets(t *testing.T) { 13 | repo, err := repository.New() 14 | if err != nil { 15 | t.Errorf(err.Error()) 16 | } 17 | defer repo.Close() 18 | 19 | if err = 
repo.DeleteAllExamples(); err != nil { 20 | t.Error(err) 21 | } 22 | 23 | e := example.NewExample("http://hoge.com", model.UNLABELED) 24 | err = repo.UpdateOrCreateExample(e) 25 | if err != nil { 26 | t.Error(err) 27 | } 28 | now := time.Now() 29 | idStr := "1111111" 30 | t1 := model.Tweet{ 31 | CreatedAt: now, 32 | IdStr: idStr, 33 | FullText: "hello world!!!", 34 | FavoriteCount: 10, 35 | RetweetCount: 10, 36 | Lang: "en", 37 | ScreenName: "syou6162", 38 | Name: "syou6162", 39 | ProfileImageUrl: "http://hogehoge.com/profile.png", 40 | Score: 1.0, 41 | } 42 | 43 | tweets := model.ReferringTweets{} 44 | tweets.Tweets = append(tweets.Tweets, &t1) 45 | tweets.Count = len(tweets.Tweets) 46 | e.ReferringTweets = &tweets 47 | if err = repo.UpdateOrCreateReferringTweets(e); err != nil { 48 | t.Error(err) 49 | } 50 | 51 | { 52 | result, err := repo.SearchReferringTweetsList(model.Examples{e}, 10) 53 | if err != nil { 54 | t.Error(err) 55 | } 56 | if len(result) == 0 { 57 | t.Error("result must not be empty") 58 | } 59 | if len(result[e.Id].Tweets) == 0 { 60 | t.Error("result must not be empty") 61 | } 62 | if result[e.Id].Count == 0 { 63 | t.Error("result must not be zero") 64 | } 65 | if result[e.Id].Tweets[0].Name != "syou6162" { 66 | t.Error("Name must be syou6162") 67 | } 68 | } 69 | 70 | { 71 | result, err := repo.FindReferringTweets(e, 10) 72 | if err != nil { 73 | t.Error(err) 74 | } 75 | if len(result.Tweets) == 0 { 76 | t.Error("result must not be empty") 77 | } 78 | if result.Count == 0 { 79 | t.Error("result must not be empty") 80 | } 81 | if result.Tweets[0].Name != "syou6162" { 82 | t.Error("Name must be syou6162") 83 | } 84 | } 85 | 86 | { 87 | result, err := repo.FindReferringTweets(e, 0) 88 | if err != nil { 89 | t.Error(err) 90 | } 91 | if len(result.Tweets) != 0 { 92 | t.Error("result must be empty") 93 | } 94 | if result.Count == 0 { 95 | t.Error("result must not be empty") 96 | } 97 | } 98 | 99 | { 100 | if err := repo.UpdateTweetLabel(e.Id, idStr, 
model.NEGATIVE); err != nil { 101 | t.Error(err) 102 | } 103 | result, err := repo.FindReferringTweets(e, 10) 104 | if err != nil { 105 | t.Error(err) 106 | } 107 | if len(result.Tweets) != 0 { 108 | t.Error("result must be empty") 109 | } 110 | if result.Count != 1 { 111 | t.Error("result must be 1") 112 | } 113 | } 114 | } 115 | 116 | func TestSearchReferringTweetsByLabel(t *testing.T) { 117 | repo, err := repository.New() 118 | if err != nil { 119 | t.Errorf(err.Error()) 120 | } 121 | defer repo.Close() 122 | 123 | if err = repo.DeleteAllExamples(); err != nil { 124 | t.Error(err) 125 | } 126 | 127 | e := example.NewExample("http://hoge.com", model.UNLABELED) 128 | err = repo.UpdateOrCreateExample(e) 129 | if err != nil { 130 | t.Error(err) 131 | } 132 | now := time.Now() 133 | idStr := "1111111" 134 | t1 := model.Tweet{ 135 | CreatedAt: now, 136 | IdStr: idStr, 137 | FullText: "hello world!!!", 138 | FavoriteCount: 10, 139 | RetweetCount: 10, 140 | Lang: "en", 141 | ScreenName: "syou6162", 142 | Name: "syou6162", 143 | ProfileImageUrl: "http://hogehoge.com/profile.png", 144 | Label: model.POSITIVE, 145 | } 146 | 147 | tweets := model.ReferringTweets{} 148 | tweets.Tweets = append(tweets.Tweets, &t1) 149 | tweets.Count = len(tweets.Tweets) 150 | e.ReferringTweets = &tweets 151 | if err = repo.UpdateOrCreateReferringTweets(e); err != nil { 152 | t.Error(err) 153 | } 154 | 155 | limit := 10 156 | { 157 | result, err := repo.SearchPositiveReferringTweets(-1.0, 3, limit) 158 | if err != nil { 159 | t.Error(err) 160 | } 161 | if len(result.Tweets) != 1 { 162 | t.Error("len(result) must be 1") 163 | } 164 | if result.Count != 1 { 165 | t.Error("Count must be 1") 166 | } 167 | } 168 | { 169 | result, err := repo.SearchNegativeReferringTweets(-1.0, 3, limit) 170 | if err != nil { 171 | t.Error(err) 172 | } 173 | if len(result.Tweets) != 0 { 174 | t.Error("len(result) must be empty") 175 | } 176 | if result.Count != 0 { 177 | t.Error("Count must be zero") 178 | } 179 | } 
180 | } 181 | 182 | func TestSearchRecentReferringTweetsWithHighScore(t *testing.T) { 183 | repo, err := repository.New() 184 | if err != nil { 185 | t.Errorf(err.Error()) 186 | } 187 | defer repo.Close() 188 | 189 | if err = repo.DeleteAllExamples(); err != nil { 190 | t.Error(err) 191 | } 192 | 193 | e := example.NewExample("http://hoge.com", model.UNLABELED) 194 | err = repo.UpdateOrCreateExample(e) 195 | if err != nil { 196 | t.Error(err) 197 | } 198 | now := time.Now() 199 | t1 := model.Tweet{ 200 | CreatedAt: now, 201 | IdStr: "1111111", 202 | FullText: "hello world!!!", 203 | FavoriteCount: 10, 204 | RetweetCount: 10, 205 | Lang: "en", 206 | ScreenName: "syou6162", 207 | Name: "syou6162", 208 | ProfileImageUrl: "http://hogehoge.com/profile.png", 209 | Label: model.POSITIVE, 210 | Score: 10.0, 211 | } 212 | t2 := model.Tweet{ 213 | CreatedAt: now, 214 | IdStr: "22222222", 215 | FullText: "hello world!!!", 216 | FavoriteCount: 10, 217 | RetweetCount: 10, 218 | Lang: "en", 219 | ScreenName: "syou6162", 220 | Name: "syou6162", 221 | ProfileImageUrl: "http://hogehoge.com/profile.png", 222 | Label: model.POSITIVE, 223 | Score: 10.0, 224 | } 225 | t3 := model.Tweet{ 226 | CreatedAt: now, 227 | IdStr: "3333333333", 228 | FullText: "hello world!!!", 229 | FavoriteCount: 10, 230 | RetweetCount: 10, 231 | Lang: "en", 232 | ScreenName: "syou6162", 233 | Name: "syou6162", 234 | ProfileImageUrl: "http://hogehoge.com/profile.png", 235 | Label: model.POSITIVE, 236 | Score: -10.0, 237 | } 238 | 239 | tweets := model.ReferringTweets{} 240 | tweets.Tweets = append(tweets.Tweets, &t1, &t2, &t3) 241 | tweets.Count = len(tweets.Tweets) 242 | e.ReferringTweets = &tweets 243 | if err = repo.UpdateOrCreateReferringTweets(e); err != nil { 244 | t.Error(err) 245 | } 246 | 247 | limit := 10 248 | { 249 | result, err := repo.SearchRecentReferringTweetsWithHighScore(now.Add(time.Duration(-10*24)*time.Hour), 0.0, limit) 250 | if err != nil { 251 | t.Error(err) 252 | } 253 | if 
// SearchRecentExamplesByHost forwards host, from and limit unchanged to the
// repository method of the same name and returns its result as is.
func (app *goActiveLearningApp) SearchRecentExamplesByHost(host string, from time.Time, limit int) (model.Examples, error) {
	return app.repo.SearchRecentExamplesByHost(host, from, limit)
}
// FindExampleByUlr forwards url to the repository's FindExampleByUlr and
// returns its result as is.
//
// NOTE(review): "Ulr" looks like a misspelling of "Url". The name is part of
// the GoActiveLearningApp interface and of the repository API, so it is kept
// here; renaming it has to happen in all of those places at once.
func (app *goActiveLearningApp) FindExampleByUlr(url string) (*model.Example, error) {
	return app.repo.FindExampleByUlr(url)
}
// UpdateTweetLabel forwards exampleId, idStr and label unchanged to the
// repository's UpdateTweetLabel and returns its error as is.
func (app *goActiveLearningApp) UpdateTweetLabel(exampleId int, idStr string, label model.LabelType) error {
	return app.repo.UpdateTweetLabel(exampleId, idStr, label)
}
SearchRecentReferringTweetsWithHighScore(from time.Time, scoreThreshold float64, limit int) (model.ReferringTweets, error) { 144 | return app.repo.SearchRecentReferringTweetsWithHighScore(from, scoreThreshold, limit) 145 | } 146 | 147 | func hatenaBookmarkByExampleId(hatenaBookmarks []*model.HatenaBookmark) map[int]*model.HatenaBookmark { 148 | result := make(map[int]*model.HatenaBookmark) 149 | for _, hb := range hatenaBookmarks { 150 | result[hb.ExampleId] = hb 151 | } 152 | return result 153 | } 154 | 155 | func (app *goActiveLearningApp) AttachMetadataIncludingFeatureVector(examples model.Examples, bookmarkLimit int, tweetLimit int) error { 156 | // make sure that example id must be filled 157 | for _, e := range examples { 158 | if e.Id == 0 { 159 | tmp, err := app.FindExampleByUlr(e.Url) 160 | if err != nil { 161 | return err 162 | } 163 | e.Id = tmp.Id 164 | } 165 | } 166 | 167 | fvList, err := app.repo.SearchFeatureVector(examples) 168 | if err != nil { 169 | return err 170 | } 171 | 172 | for _, e := range examples { 173 | if fv, ok := fvList[e.Id]; ok { 174 | e.Fv = fv 175 | } 176 | } 177 | 178 | return app.AttachMetadata(examples, bookmarkLimit, tweetLimit) 179 | } 180 | 181 | func (app *goActiveLearningApp) AttachMetadata(examples model.Examples, bookmarkLimit int, tweetLimit int) error { 182 | hatenaBookmarks, err := app.repo.SearchHatenaBookmarks(examples, bookmarkLimit) 183 | if err != nil { 184 | return err 185 | } 186 | hbByid := hatenaBookmarkByExampleId(hatenaBookmarks) 187 | for _, e := range examples { 188 | if b, ok := hbByid[e.Id]; ok { 189 | e.HatenaBookmark = b 190 | } else { 191 | e.HatenaBookmark = &model.HatenaBookmark{Bookmarks: []*model.Bookmark{}} 192 | } 193 | } 194 | 195 | referringTweetsById, err := app.repo.SearchReferringTweetsList(examples, tweetLimit) 196 | if err != nil { 197 | return err 198 | } 199 | for _, e := range examples { 200 | if t, ok := referringTweetsById[e.Id]; ok { 201 | e.ReferringTweets = &t 202 | } else { 203 
| e.ReferringTweets = &model.ReferringTweets{} 204 | } 205 | } 206 | return nil 207 | } 208 | 209 | func (app *goActiveLearningApp) UpdateRelatedExamples(related model.RelatedExamples) error { 210 | return app.repo.UpdateRelatedExamples(related) 211 | } 212 | 213 | func (app *goActiveLearningApp) SearchRelatedExamples(e *model.Example) (model.Examples, error) { 214 | related, err := app.repo.FindRelatedExamples(e) 215 | if err != nil { 216 | return nil, err 217 | } 218 | return app.repo.SearchExamplesByIds(related.RelatedExampleIds) 219 | } 220 | 221 | func (app *goActiveLearningApp) UpdateTopAccessedExampleIds(exampleIds []int) error { 222 | return app.repo.UpdateTopAccessedExampleIds(exampleIds) 223 | } 224 | 225 | func (app *goActiveLearningApp) SearchTopAccessedExamples() (model.Examples, error) { 226 | exampleIds, err := app.repo.SearchTopAccessedExampleIds() 227 | if err != nil { 228 | return nil, err 229 | } 230 | return app.repo.SearchExamplesByIds(exampleIds) 231 | } 232 | 233 | func (app *goActiveLearningApp) UpdateRecommendation(listName string, examples model.Examples) error { 234 | listType, err := model.GetRecommendationListType(listName) 235 | if err != nil { 236 | return err 237 | } 238 | 239 | exampleIds := make([]int, 0) 240 | for _, e := range examples { 241 | exampleIds = append(exampleIds, e.Id) 242 | } 243 | 244 | rec := model.Recommendation{RecommendationListType: listType, ExampleIds: exampleIds} 245 | return app.repo.UpdateRecommendation(rec) 246 | } 247 | 248 | func (app *goActiveLearningApp) GetRecommendation(listName string) (model.Examples, error) { 249 | listType, err := model.GetRecommendationListType(listName) 250 | if err != nil { 251 | return nil, err 252 | } 253 | rec, err := app.repo.FindRecommendation(listType) 254 | return app.repo.SearchExamplesByIds(rec.ExampleIds) 255 | } 256 | 257 | func (app *goActiveLearningApp) splitExamplesByStatusOK(examples model.Examples) (model.Examples, model.Examples, error) { 258 | urls := 
make([]string, 0) 259 | exampleByurl := make(map[string]*model.Example) 260 | for _, e := range examples { 261 | exampleByurl[e.Url] = e 262 | urls = append(urls, e.Url) 263 | } 264 | tmpExamples, err := app.SearchExamplesByUlrs(urls) 265 | if err != nil { 266 | return nil, nil, err 267 | } 268 | 269 | examplesWithMetaData := model.Examples{} 270 | examplesWithEmptyMetaData := model.Examples{} 271 | for _, e := range tmpExamples { 272 | if e.StatusCode == http.StatusOK { 273 | examplesWithMetaData = append(examplesWithMetaData, exampleByurl[e.Url]) 274 | delete(exampleByurl, e.Url) 275 | } else { 276 | examplesWithEmptyMetaData = append(examplesWithEmptyMetaData, exampleByurl[e.Url]) 277 | delete(exampleByurl, e.Url) 278 | } 279 | } 280 | for _, e := range exampleByurl { 281 | examplesWithEmptyMetaData = append(examplesWithEmptyMetaData, e) 282 | } 283 | return examplesWithMetaData, examplesWithEmptyMetaData, nil 284 | } 285 | 286 | func fetchMetaData(e *model.Example) error { 287 | article, err := fetcher.GetArticle(e.Url) 288 | if err != nil { 289 | return err 290 | } 291 | 292 | e.Title = article.Title 293 | e.FinalUrl = article.Url 294 | e.Description = article.Description 295 | e.OgDescription = article.OgDescription 296 | e.OgType = article.OgType 297 | e.OgImage = article.OgImage 298 | e.Body = article.Body 299 | e.StatusCode = article.StatusCode 300 | e.Favicon = article.Favicon 301 | 302 | now := time.Now() 303 | tooOldDate := time.Date(2000, time.January, 1, 1, 1, 0, 0, time.UTC) 304 | if article.PublishDate != nil && (now.After(*article.PublishDate) || tooOldDate.Before(*article.PublishDate)) { 305 | e.CreatedAt = *article.PublishDate 306 | e.UpdatedAt = *article.PublishDate 307 | } 308 | 309 | fv := util.RemoveDuplicate(example.ExtractFeatures(*e)) 310 | if len(fv) > 100000 { 311 | return fmt.Errorf("too large features (N = %d) for %s", len(fv), e.FinalUrl) 312 | } 313 | e.Fv = fv 314 | 315 | return nil 316 | } 317 | 318 | func (app 
*goActiveLearningApp) Fetch(examples model.Examples) { 319 | batchSize := 100 320 | examplesList := make([]model.Examples, 0) 321 | n := len(examples) 322 | 323 | for i := 0; i < n; i += batchSize { 324 | max := int(math.Min(float64(i+batchSize), float64(n))) 325 | examplesList = append(examplesList, examples[i:max]) 326 | } 327 | for _, l := range examplesList { 328 | examplesWithMetaData, examplesWithEmptyMetaData, err := app.splitExamplesByStatusOK(l) 329 | if err != nil { 330 | log.Println(err.Error()) 331 | } 332 | // ToDo: 本当に必要か考える 333 | app.AttachMetadataIncludingFeatureVector(examplesWithMetaData, 0, 0) 334 | 335 | wg := &sync.WaitGroup{} 336 | cpus := runtime.NumCPU() 337 | runtime.GOMAXPROCS(cpus) 338 | sem := make(chan struct{}, batchSize) 339 | for idx, e := range examplesWithEmptyMetaData { 340 | wg.Add(1) 341 | sem <- struct{}{} 342 | go func(e *model.Example, idx int) { 343 | defer wg.Done() 344 | cnt, err := app.repo.GetErrorCount(e) 345 | if err != nil { 346 | log.Println(err.Error()) 347 | } 348 | if cnt < 5 { 349 | fmt.Fprintln(os.Stderr, "Fetching("+strconv.Itoa(idx)+"): "+e.Url) 350 | if err := fetchMetaData(e); err != nil { 351 | app.repo.IncErrorCount(e) 352 | log.Println(err.Error()) 353 | } 354 | } 355 | <-sem 356 | }(e, idx) 357 | } 358 | wg.Wait() 359 | } 360 | } 361 | -------------------------------------------------------------------------------- /lib/service/example_test.go: -------------------------------------------------------------------------------- 1 | package service_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/syou6162/go-active-learning/lib/example" 7 | "github.com/syou6162/go-active-learning/lib/model" 8 | "github.com/syou6162/go-active-learning/lib/service" 9 | ) 10 | 11 | func findExampleByurl(examples model.Examples, url string) *model.Example { 12 | for _, e := range examples { 13 | if e.Url == url { 14 | return e 15 | } 16 | } 17 | return nil 18 | } 19 | 20 | func TestAttachMetaData(t *testing.T) { 21 | app, 
err := service.NewDefaultApp() 22 | if err != nil { 23 | t.Error(err) 24 | } 25 | defer app.Close() 26 | if err := app.DeleteAllExamples(); err != nil { 27 | t.Error("Cannot delete examples") 28 | } 29 | 30 | hatebuUrl := "https://b.hatena.ne.jp" 31 | myBlogUrl := "https://www.yasuhisay.info" 32 | githubUrl := "https://github.com" 33 | e1 := example.NewExample(hatebuUrl, model.POSITIVE) 34 | e2 := example.NewExample(myBlogUrl, model.NEGATIVE) 35 | e3 := example.NewExample(githubUrl, model.UNLABELED) 36 | examples := model.Examples{e1, e2, e3} 37 | 38 | hatebu := findExampleByurl(examples, hatebuUrl) 39 | if hatebu == nil { 40 | t.Errorf("Cannot find %s", hatebuUrl) 41 | } 42 | if hatebu.Title != "" { 43 | t.Errorf("Title must be empty for %s", hatebu.Url) 44 | } 45 | if len(hatebu.Fv) != 0 { 46 | t.Errorf("Feature vector must be empty for %s", hatebu.Url) 47 | } 48 | app.AttachMetadataIncludingFeatureVector(examples, 10, 10) 49 | 50 | if hatebu.Title != "" { 51 | t.Errorf("Title must be empty for %s", hatebu.Url) 52 | } 53 | if len(hatebu.Fv) != 0 { 54 | t.Errorf("Feature vector must be empty for %s", hatebu.Url) 55 | } 56 | 57 | myBlog := findExampleByurl(examples, myBlogUrl) 58 | if myBlog == nil { 59 | t.Errorf("Cannot find %s", myBlogUrl) 60 | } 61 | if myBlog.OgType != "" { 62 | t.Errorf("OgType must be empty for %s", myBlog.Url) 63 | } 64 | 65 | app.Fetch(examples) 66 | for _, e := range examples { 67 | err = app.UpdateOrCreateExample(e) 68 | if err != nil { 69 | t.Error(err) 70 | } 71 | err = app.UpdateFeatureVector(e) 72 | if err != nil { 73 | t.Error(err) 74 | } 75 | } 76 | if hatebu.Title == "" { 77 | t.Errorf("Title must not be empty for %s", hatebu.Url) 78 | } 79 | if len(hatebu.Fv) == 0 { 80 | t.Errorf("Feature vector must not be empty for %s", hatebu.Url) 81 | } 82 | 83 | if myBlog.OgType != "blog" { 84 | t.Errorf("OgType must be blog for %s", myBlog.Url) 85 | } 86 | 87 | examples, err = app.SearchExamplesByIds([]int{e1.Id, e2.Id, e3.Id}) 88 | if err 
!= nil { 89 | t.Error(err) 90 | } 91 | err = app.AttachMetadataIncludingFeatureVector(examples, 10, 10) 92 | if err != nil { 93 | t.Error(err) 94 | } 95 | 96 | if hatebu.Title == "" { 97 | t.Errorf("Title must be empty for %s", hatebu.Url) 98 | } 99 | if len(hatebu.Fv) == 0 { 100 | t.Errorf("Feature vector must not be empty for %s", hatebu.Url) 101 | } 102 | 103 | if myBlog.OgType != "blog" { 104 | t.Errorf("OgType must be blog for %s", myBlog.Url) 105 | } 106 | } 107 | 108 | func TestGetRecommendation(t *testing.T) { 109 | app, err := service.NewDefaultApp() 110 | if err != nil { 111 | t.Error(err) 112 | } 113 | defer app.Close() 114 | if err := app.DeleteAllExamples(); err != nil { 115 | t.Error("Cannot delete examples") 116 | } 117 | 118 | e1 := example.NewExample("http://hoge1.com", model.POSITIVE) 119 | e2 := example.NewExample("http://hoge2.com", model.NEGATIVE) 120 | e3 := example.NewExample("http://hoge3.com", model.UNLABELED) 121 | examples := model.Examples{e1, e2, e3} 122 | for _, e := range examples { 123 | err = app.UpdateOrCreateExample(e) 124 | if err != nil { 125 | t.Error(err) 126 | } 127 | } 128 | 129 | listName := "general" 130 | err = app.UpdateRecommendation(listName, examples) 131 | if err != nil { 132 | t.Error(err) 133 | } 134 | examples, err = app.GetRecommendation(listName) 135 | if err != nil { 136 | t.Error(err) 137 | } 138 | if len(examples) != 3 { 139 | t.Errorf("len(examples) should be 3, but %d", len(examples)) 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /lib/service/service.go: -------------------------------------------------------------------------------- 1 | package service 2 | 3 | import ( 4 | "bufio" 5 | "io" 6 | "time" 7 | 8 | "github.com/syou6162/go-active-learning/lib/classifier" 9 | "github.com/syou6162/go-active-learning/lib/model" 10 | "github.com/syou6162/go-active-learning/lib/repository" 11 | ) 12 | 13 | type GoActiveLearningApp interface { 14 | 
UpdateOrCreateExample(e *model.Example) error 15 | UpdateScore(e *model.Example) error 16 | InsertExampleFromScanner(scanner *bufio.Scanner) (*model.Example, error) 17 | InsertExamplesFromReader(reader io.Reader) error 18 | SearchExamples() (model.Examples, error) 19 | SearchRecentExamples(from time.Time, limit int) (model.Examples, error) 20 | SearchRecentExamplesByHost(host string, from time.Time, limit int) (model.Examples, error) 21 | SearchExamplesByLabel(label model.LabelType, limit int) (model.Examples, error) 22 | SearchLabeledExamples(limit int) (model.Examples, error) 23 | SearchPositiveExamples(limit int) (model.Examples, error) 24 | SearchNegativeExamples(limit int) (model.Examples, error) 25 | SearchUnlabeledExamples(limit int) (model.Examples, error) 26 | SearchPositiveScoredExamples(limit int) (model.Examples, error) 27 | FindExampleByUlr(url string) (*model.Example, error) 28 | FindExampleById(id int) (*model.Example, error) 29 | SearchExamplesByUlrs(urls []string) (model.Examples, error) 30 | SearchExamplesByIds(ids []int) (model.Examples, error) 31 | SearchExamplesByKeywords(keywords []string, aggregator string, limit int) (model.Examples, error) 32 | DeleteAllExamples() error 33 | CountPositiveExamples() (int, error) 34 | CountNegativeExamples() (int, error) 35 | CountUnlabeledExamples() (int, error) 36 | 37 | InsertMIRAModel(m classifier.MIRAClassifier) error 38 | FindLatestMIRAModel(modelType classifier.ModelType) (*classifier.MIRAClassifier, error) 39 | 40 | UpdateFeatureVector(e *model.Example) error 41 | UpdateHatenaBookmark(e *model.Example) error 42 | UpdateOrCreateReferringTweets(e *model.Example) error 43 | UpdateTweetLabel(exampleId int, idStr string, label model.LabelType) error 44 | SearchReferringTweets(limit int) (model.ReferringTweets, error) 45 | SearchPositiveReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) 46 | SearchNegativeReferringTweets(scoreThreshold float64, 
tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) 47 | SearchUnlabeledReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) 48 | SearchRecentReferringTweetsWithHighScore(from time.Time, scoreThreshold float64, limit int) (model.ReferringTweets, error) 49 | Fetch(examples model.Examples) 50 | 51 | AttachMetadataIncludingFeatureVector(examples model.Examples, bookmarkLimit int, tweetLimit int) error 52 | AttachMetadata(examples model.Examples, bookmarkLimit, tweetLimit int) error 53 | 54 | UpdateRecommendation(listName string, examples model.Examples) error 55 | GetRecommendation(listName string) (model.Examples, error) 56 | 57 | UpdateRelatedExamples(related model.RelatedExamples) error 58 | SearchRelatedExamples(e *model.Example) (model.Examples, error) 59 | 60 | UpdateTopAccessedExampleIds(exampleIds []int) error 61 | SearchTopAccessedExamples() (model.Examples, error) 62 | 63 | Ping() error 64 | Close() error 65 | } 66 | 67 | func NewApp(repo repository.Repository) GoActiveLearningApp { 68 | return &goActiveLearningApp{repo: repo} 69 | } 70 | 71 | func NewDefaultApp() (GoActiveLearningApp, error) { 72 | repo, err := repository.New() 73 | if err != nil { 74 | return nil, err 75 | } 76 | return &goActiveLearningApp{repo: repo}, nil 77 | } 78 | 79 | type goActiveLearningApp struct { 80 | repo repository.Repository 81 | } 82 | 83 | func (app *goActiveLearningApp) InsertMIRAModel(m classifier.MIRAClassifier) error { 84 | return app.repo.InsertMIRAModel(m) 85 | } 86 | 87 | func (app *goActiveLearningApp) FindLatestMIRAModel(modelType classifier.ModelType) (*classifier.MIRAClassifier, error) { 88 | return app.repo.FindLatestMIRAModel(modelType) 89 | } 90 | 91 | func (app *goActiveLearningApp) Ping() error { 92 | if err := app.repo.Ping(); err != nil { 93 | return err 94 | } 95 | return nil 96 | } 97 | 98 | func (app *goActiveLearningApp) Close() error { 99 | if err := app.repo.Close(); err 
!= nil { 100 | return err 101 | } 102 | return nil 103 | } 104 | -------------------------------------------------------------------------------- /lib/top_accessed_example/top_accessed_example.go: -------------------------------------------------------------------------------- 1 | package top_accessed_example 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "strconv" 7 | 8 | "os" 9 | 10 | "github.com/syou6162/go-active-learning/lib/service" 11 | "github.com/urfave/cli" 12 | ) 13 | 14 | func parseLine(line string) (int, error) { 15 | exampleId, err := strconv.ParseInt(line, 10, 0) 16 | if err != nil { 17 | return 0, fmt.Errorf("Invalid line: %s", line) 18 | } 19 | return int(exampleId), nil 20 | } 21 | 22 | func readTopAccessedExampleIds(filename string) ([]int, error) { 23 | fp, err := os.Open(filename) 24 | defer fp.Close() 25 | if err != nil { 26 | return nil, err 27 | } 28 | 29 | exampleIds := make([]int, 0) 30 | scanner := bufio.NewScanner(fp) 31 | for scanner.Scan() { 32 | line := scanner.Text() 33 | exampleId, err := parseLine(line) 34 | if err != nil { 35 | return nil, err 36 | } 37 | exampleIds = append(exampleIds, exampleId) 38 | } 39 | if err := scanner.Err(); err != nil { 40 | return nil, err 41 | } 42 | return exampleIds, nil 43 | } 44 | 45 | func doAddTopAccessedExamples(c *cli.Context) error { 46 | inputFilename := c.String("input-filename") 47 | 48 | if inputFilename == "" { 49 | _ = cli.ShowCommandHelp(c, "add-top-accessed-examples") 50 | return cli.NewExitError("`input-filename` is a required field.", 1) 51 | } 52 | 53 | app, err := service.NewDefaultApp() 54 | if err != nil { 55 | return err 56 | } 57 | defer app.Close() 58 | 59 | exampleIds, err := readTopAccessedExampleIds(inputFilename) 60 | if err != nil { 61 | return err 62 | } 63 | err = app.UpdateTopAccessedExampleIds(exampleIds) 64 | if err != nil { 65 | return err 66 | } 67 | return nil 68 | } 69 | 70 | var CommandAddTopAccessedExamples = cli.Command{ 71 | Name: "add-top-accessed-examples", 72 
| Usage: "add top accessed examples", 73 | Description: ` 74 | Add top accessed examples. 75 | `, 76 | Action: doAddTopAccessedExamples, 77 | Flags: []cli.Flag{ 78 | cli.StringFlag{Name: "input-filename"}, 79 | }, 80 | } 81 | -------------------------------------------------------------------------------- /lib/util/converter/converter.go: -------------------------------------------------------------------------------- 1 | package converter 2 | 3 | import "github.com/syou6162/go-active-learning/lib/model" 4 | import "github.com/syou6162/go-active-learning/lib/classifier" 5 | 6 | func ConvertExamplesToLearningInstances(examples model.Examples) classifier.LearningInstances { 7 | instances := classifier.LearningInstances{} 8 | for _, e := range examples { 9 | instances = append(instances, e) 10 | } 11 | return instances 12 | } 13 | -------------------------------------------------------------------------------- /lib/util/file/file.go: -------------------------------------------------------------------------------- 1 | package file 2 | 3 | import ( 4 | "bufio" 5 | "errors" 6 | "fmt" 7 | "os" 8 | "strconv" 9 | "strings" 10 | 11 | "github.com/syou6162/go-active-learning/lib/example" 12 | "github.com/syou6162/go-active-learning/lib/model" 13 | ) 14 | 15 | func ParseLine(line string) (*model.Example, error) { 16 | tokens := strings.Split(line, "\t") 17 | var url string 18 | if len(tokens) == 1 { 19 | url = tokens[0] 20 | return example.NewExample(url, model.UNLABELED), nil 21 | } else if len(tokens) == 2 { 22 | url = tokens[0] 23 | label, _ := strconv.ParseInt(tokens[1], 10, 0) 24 | switch model.LabelType(label) { 25 | case model.POSITIVE, model.NEGATIVE, model.UNLABELED: 26 | return example.NewExample(url, model.LabelType(label)), nil 27 | default: 28 | return nil, errors.New(fmt.Sprintf("Invalid Label type %d in %s", label, line)) 29 | } 30 | } else { 31 | return nil, errors.New(fmt.Sprintf("Invalid line: %s", line)) 32 | } 33 | } 34 | 35 | func ReadExamples(filename 
string) ([]*model.Example, error) { 36 | fp, err := os.Open(filename) 37 | defer fp.Close() 38 | if err != nil { 39 | return nil, err 40 | } 41 | 42 | scanner := bufio.NewScanner(fp) 43 | var examples model.Examples 44 | for scanner.Scan() { 45 | line := scanner.Text() 46 | e, err := ParseLine(line) 47 | if err != nil { 48 | return nil, err 49 | } 50 | examples = append(examples, e) 51 | } 52 | if err := scanner.Err(); err != nil { 53 | return nil, err 54 | } 55 | return examples, nil 56 | } 57 | 58 | func WriteExamples(examples model.Examples, filename string) error { 59 | fp, err := os.Create(filename) 60 | defer fp.Close() 61 | if err != nil { 62 | return err 63 | } 64 | 65 | writer := bufio.NewWriter(fp) 66 | for _, e := range examples { 67 | if e.IsNew && e.IsLabeled() { 68 | url := e.FinalUrl 69 | if url == "" { 70 | url = e.Url 71 | } 72 | _, err := writer.WriteString(url + "\t" + strconv.Itoa(int(e.Label)) + "\n") 73 | if err != nil { 74 | return err 75 | } 76 | } 77 | } 78 | 79 | writer.Flush() 80 | return nil 81 | } 82 | -------------------------------------------------------------------------------- /lib/util/file/file_test.go: -------------------------------------------------------------------------------- 1 | package file 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/syou6162/go-active-learning/lib/example" 8 | "github.com/syou6162/go-active-learning/lib/model" 9 | ) 10 | 11 | func TestParseLine(t *testing.T) { 12 | line1 := "http://model.com\t1" 13 | e, err := ParseLine(line1) 14 | 15 | if err != nil { 16 | t.Error("cannot parse line1") 17 | } 18 | if e.Label != model.POSITIVE { 19 | t.Error("Label must be POSITIVE") 20 | } 21 | 22 | line2 := "http://model.com\t-1" 23 | e, err = ParseLine(line2) 24 | 25 | if err != nil { 26 | t.Error("cannot parse line2") 27 | } 28 | if e.Label != model.NEGATIVE { 29 | t.Error("Label must be NEGATIVE") 30 | } 31 | 32 | line3 := "http://model.com" 33 | e, err = ParseLine(line3) 34 | 35 | if err != nil { 
36 | t.Error("cannot parse line3") 37 | } 38 | if e.Label != model.UNLABELED { 39 | t.Error("Label must be UNLABELED") 40 | } 41 | 42 | line4 := "http://model.com\t2" 43 | e, err = ParseLine(line4) 44 | 45 | if e != nil { 46 | t.Error("wrong line format") 47 | } 48 | } 49 | 50 | func TestReadExamples(t *testing.T) { 51 | filename := "../../../tech_input_example.txt" 52 | examples, err := ReadExamples(filename) 53 | 54 | if err != nil { 55 | fmt.Println(err.Error()) 56 | t.Error(fmt.Printf("Cannot read examples from %s\n", filename)) 57 | } 58 | if len(examples) == 0 { 59 | t.Error(fmt.Printf("%s should contain more than one examples\n", filename)) 60 | } 61 | } 62 | 63 | func TestWriteExamples(t *testing.T) { 64 | filename := ".write_test.txt" 65 | e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE) 66 | e2 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE) 67 | 68 | err := WriteExamples(model.Examples{e1, e2}, filename) 69 | if err != nil { 70 | t.Error(fmt.Printf("Cannot write examples to %s", filename)) 71 | } 72 | 73 | examples, err := ReadExamples(filename) 74 | if err != nil { 75 | t.Error(fmt.Printf("Cannot read examples from %s", filename)) 76 | } 77 | if len(examples) == 2 { 78 | t.Error(fmt.Printf("%s should contain two examples", filename)) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /lib/util/util.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/syou6162/go-active-learning/lib/model" 7 | ) 8 | 9 | func FilterLabeledExamples(examples model.Examples) model.Examples { 10 | var result model.Examples 11 | for _, e := range examples { 12 | if e.IsLabeled() { 13 | result = append(result, e) 14 | } 15 | } 16 | return result 17 | } 18 | 19 | func FilterUnlabeledExamples(examples model.Examples) model.Examples { 20 | result := model.Examples{} 21 | 22 | alreadyLabeledByURL := 
// RemoveDuplicate returns the distinct strings of args in order of first
// appearance. The returned slice is always non-nil, even for empty input.
func RemoveDuplicate(args []string) []string {
	seen := make(map[string]struct{})
	unique := []string{}
	for _, s := range args {
		if _, dup := seen[s]; dup {
			continue
		}
		seen[s] = struct{}{}
		unique = append(unique, s)
	}
	return unique
}
// GetEnv looks up the environment variable key and returns its value.
// When the variable is not set at all, fallback is returned instead; a
// variable that is set to the empty string yields "".
func GetEnv(key, fallback string) string {
	if v, found := os.LookupEnv(key); found {
		return v
	}
	return fallback
}
FilterUnlabeledExamples(model.Examples{e1, e2, e3}) 28 | if len(examples) != 1 { 29 | t.Error("Number of unlabeled examples should be 1") 30 | } 31 | } 32 | 33 | func TestFilterStatusCodeOkExamples(t *testing.T) { 34 | e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE) 35 | e1.StatusCode = 200 36 | e2 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE) 37 | e2.StatusCode = 404 38 | e3 := example.NewExample("http://google.com", model.UNLABELED) 39 | e3.StatusCode = 304 40 | 41 | examples := FilterStatusCodeOkExamples(model.Examples{e1, e2, e3}) 42 | if len(examples) != 1 { 43 | t.Error("Number of examples (status code = 200) should be 1") 44 | } 45 | } 46 | 47 | func TestUniqueByFinalUrl(t *testing.T) { 48 | e1 := model.Example{FinalUrl: "aaa"} 49 | e2 := model.Example{FinalUrl: "bbb"} 50 | e3 := model.Example{FinalUrl: "aaa"} 51 | examples := model.Examples{&e1, &e2, &e3} 52 | result := UniqueByFinalUrl(examples) 53 | if len(result) != 2 { 54 | t.Errorf("length(result) should be %d, but %d", 2, len(result)) 55 | } 56 | } 57 | 58 | func TestRemoveDuplicate(t *testing.T) { 59 | args := []string{"hoge", "fuga", "piyo", "hoge"} 60 | 61 | result := RemoveDuplicate(args) 62 | if len(result) != 3 { 63 | t.Error("Number of unique string in args should be 3") 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/syou6162/go-active-learning/lib/command" 8 | "github.com/urfave/cli" 9 | ) 10 | 11 | func main() { 12 | app := cli.NewApp() 13 | app.Name = "go-active-learning" 14 | app.Commands = command.Commands 15 | 16 | if err := app.Run(os.Args); err != nil { 17 | fmt.Fprintln(os.Stderr, err) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /migrations/0.sql: 
-- +migrate Up
-- Creates the "example" table: one row per URL with its integer label and
-- creation/update timestamps.
CREATE TABLE IF NOT EXISTS example (
  "id" SERIAL NOT NULL PRIMARY KEY,
  "url" TEXT NOT NULL,
  "label" INT NOT NULL,
  "created_at" timestamp NOT NULL,
  "updated_at" timestamp NOT NULL
);

-- A URL may be stored only once.
CREATE UNIQUE INDEX IF NOT EXISTS "url_idx_example" ON example ("url");
-- Composite index on label, then updated_at descending.
-- NOTE(review): presumably serves "most recently updated per label" queries; confirm against the repository code.
CREATE INDEX IF NOT EXISTS "label_updated_at_idx_example" ON example ("label", "updated_at" DESC);

-- +migrate Down
-- Reverts the Up section: drops both indexes, then the table itself.
DROP INDEX "url_idx_example";
DROP INDEX "label_updated_at_idx_example";

DROP TABLE example;
-- +migrate Up
-- Extends "model" with a model_type discriminator, a "c" value and four
-- evaluation metrics. Every new column is NOT NULL with a zero default, so
-- existing rows stay valid.
-- NOTE(review): "c" is presumably a classifier hyperparameter; confirm against the code that writes it.
ALTER TABLE "model" ADD COLUMN "model_type" INT NOT NULL DEFAULT 0;
ALTER TABLE "model" ADD COLUMN "c" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;
ALTER TABLE "model" ADD COLUMN "accuracy" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;
ALTER TABLE "model" ADD COLUMN "precision" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;
ALTER TABLE "model" ADD COLUMN "recall" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;
ALTER TABLE "model" ADD COLUMN "fvalue" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;

-- Replace the created_at-only index with one keyed by (model_type, created_at).
DROP INDEX "created_at_model";
CREATE INDEX IF NOT EXISTS "model_type_created_at_model" ON model ("model_type", "created_at");

-- +migrate Down
-- Reverts the Up section: drops the composite index and the added columns,
-- then restores the original created_at index.
DROP INDEX "model_type_created_at_model";

ALTER TABLE "model" DROP COLUMN "model_type";
ALTER TABLE "model" DROP COLUMN "c";
ALTER TABLE "model" DROP COLUMN "accuracy";
ALTER TABLE "model" DROP COLUMN "precision";
ALTER TABLE "model" DROP COLUMN "recall";
ALTER TABLE "model" DROP COLUMN "fvalue";

CREATE INDEX IF NOT EXISTS "created_at_model" ON model ("created_at");
-- +migrate Up
-- Creates "top_accessed_example", a table that holds only example IDs.
-- Rows are removed automatically when the referenced example row is deleted
-- (ON DELETE CASCADE).
-- NOTE(review): "example_id" is declared SERIAL although it is a foreign key;
-- SERIAL attaches a sequence-backed default, which looks unintended here. Confirm.
CREATE TABLE IF NOT EXISTS top_accessed_example (
  "example_id" SERIAL NOT NULL,
  CONSTRAINT top_accessed_example_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE
);

-- An example may appear in the table at most once.
CREATE UNIQUE INDEX IF NOT EXISTS "example_id_idx_top_accessed_example" ON top_accessed_example ("example_id");

-- +migrate Down
-- Reverts the Up section: drops the index, then the table.
DROP INDEX "example_id_idx_top_accessed_example";

DROP TABLE top_accessed_example;
-- +migrate Up
-- Drops the unique index on example.final_url (created in migration 1), so
-- several rows may share the same final_url from this point on.
DROP INDEX "final_url_idx_example";

-- +migrate Down
-- Restores the unique index on example.final_url.
-- NOTE(review): this will fail if duplicate final_url values were inserted
-- while the index was absent. Confirm before rolling back on real data.
CREATE UNIQUE INDEX IF NOT EXISTS "final_url_idx_example" ON example ("final_url");
-- +migrate Up
-- Creates "tweet": tweet records attached to an example row. The columns hold
-- the tweet's own fields (id_str, full_text, counts, lang, created_at) and the
-- author's screen_name, name and profile_image_url. Rows are deleted together
-- with their example (ON DELETE CASCADE).
-- NOTE(review): "example_id" is declared SERIAL although it is a foreign key;
-- the sequence-backed default looks unintended. Confirm.
CREATE TABLE IF NOT EXISTS tweet (
  "id" SERIAL NOT NULL PRIMARY KEY,
  "example_id" SERIAL NOT NULL,

  "created_at" timestamp NOT NULL,
  "id_str" TEXT NOT NULL,
  "full_text" TEXT NOT NULL,
  "favorite_count" INT NOT NULL,
  "retweet_count" INT NOT NULL,
  "lang" TEXT NOT NULL,

  "screen_name" TEXT NOT NULL,
  "name" TEXT NOT NULL,
  "profile_image_url" TEXT NOT NULL,

  CONSTRAINT tweet_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE
);

-- Non-unique index for looking up the tweets of one example.
CREATE INDEX IF NOT EXISTS "example_id_idx_tweet" ON tweet ("example_id");
-- The same tweet (id_str) may be stored only once per example.
CREATE UNIQUE INDEX IF NOT EXISTS "example_id_id_str_idx_tweet" ON tweet ("example_id", "id_str");

-- +migrate Down
-- Reverts the Up section: drops both indexes, then the table.
DROP INDEX "example_id_id_str_idx_tweet";
DROP INDEX "example_id_idx_tweet";
DROP TABLE tweet;
-- Bootstrap script: creates the application database, the test database and
-- the "nobody" login role (password 'nobody').
CREATE DATABASE "go-active-learning";
CREATE DATABASE "go-active-learning-test";

CREATE ROLE "nobody" WITH PASSWORD 'nobody' LOGIN;

-- NOTE(review): these grants cover the tables and sequences that already exist
-- in the public schema of the database the session is connected to. As written
-- they do not appear to target the two databases created above; confirm how
-- this script is meant to be run.
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO nobody;
GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO nobody;
https://srdk.rakuten.jp/ 11 | https://kuenishi.hatenadiary.jp/entry/2017/05/25/005527 12 | http://otiai10.hatenablog.com/entry/2017/05/24/163701 13 | https://www.yasuhisay.info/entry/2016/11/23/000000 14 | https://www.yasuhisay.info/entry/20090516/1242480413 15 | https://www.yasuhisay.info/entry/2017/05/18/080000 16 | https://arxiv.org/abs/1906.03776 1 17 | https://tech-blog.optim.co.jp/entry/2019/07/04/173000 1 18 | http://www.ai-gakkai.or.jp/my-bookmark_vol34-no4/ 1 19 | https://data.gunosy.io/entry/poincare_embedding_for_recommendations 1 20 | http://englishforhackers.com/ -1 21 | https://www.youtube.com/watch?v=5ZwknHMf1yo -1 22 | https://speakerdeck.com/livesense/shi-ye-heng-duan-zu-zhi-defalsemlsisutemukai-fa-yun-yong-toji-pan-she-ji 1 23 | https://www.yasuhisay.info/entry/splatoon2_udemae_x -1 24 | https://www.yasuhisay.info/entry/2018/11/13/090000 -1 25 | https://www.yasuhisay.info/entry/2016/09/26/080000 -1 26 | https://www.yasuhisay.info/entry/2016/03/27/215344 -1 27 | https://www.yasuhisay.info/entry/20110714/1310622171 -1 28 | https://anond.hatelabo.jp/20190713043218 -1 29 | https://cybozushiki.cybozu.co.jp/articles/m005412.html -1 30 | https://sorazine.soracom.jp/entry/2019/07/12/celestehair -1 31 | https://www.yasuhisay.info/entry/20090516/1242480413 1 32 | https://www.yasuhisay.info/entry/kaggle_avazu_ctr_prediction 1 33 | https://www.yasuhisay.info/entry/mlct_mackerel_anomaly_detection 1 34 | https://www.yasuhisay.info/entry/2018/10/04/201000 1 35 | https://honz.jp/articles/-/45278 -1 36 | https://www.megamouth.info/entry/2019/07/12/175250 -1 37 | https://www.lifehacker.jp/2019/07/193679_higedanshaku.html -1 38 | https://teineini.net/20190711-evernote-dokusyonote/ -1 39 | https://dev.classmethod.jp/tool/be-vimmer-by-trainings/ 1 40 | https://www.clear-code.com/blog/2019/7/12.html 1 41 | https://techlife.cookpad.com/entry/2019/07/13/055601 1 42 | https://future-architect.github.io/articles/20190713/ 1 43 | 
https://blog.craftz.dog/my-dev-workflow-using-tmux-vim-48f73cc4f39e 1 44 | https://junkyard.song.mu/slides/gocon2019-fukuoka/ 1 45 | https://nykergoto.hatenablog.jp/entry/2019/07/09/FFT_を使った時系列データ解析 1 46 | http://memorability.csail.mit.edu/index.html 1 47 | https://ynd.co/blog/tensorflow-vs-pytorch/ 1 48 | https://cloudplatform-jp.googleblog.com/2019/07/analyze-bigquery-data-with-kaggle-kernels-notebooks.html 1 49 | https://omedstu.jimdo.com/2019/07/05/force法によるrecurrent-spiking-neural-networksの教師あり学習/ 1 50 | https://buildersbox.corp-sansan.com/entry/2019/07/12/110000 1 51 | https://www.slideshare.net/shunsukekozawa5/gunosy-152302982 1 52 | https://ml-loft.connpass.com/event/136426/ 1 53 | https://nlp.stanford.edu/blog/a-new-multi-turn-multi-domain-task-oriented-dialogue-dataset/ 1 54 | https://blog.ml.cmu.edu/2019/08/02/regret-circuits-composability-of-regret-minimizers/ 1 55 | https://tech-blog.abeja.asia/entry/noisy-label-ml-survey 1 56 | https://imas.connpass.com/event/140272/ -1 57 | http://alissak.hatenablog.com/entry/2019/08/03/170413 -1 58 | https://ai.facebook.com/blog/advances-in-conversational-ai/ 1 59 | https://heartbeat.fritz.ai/a-2019-guide-to-semantic-segmentation-ca8242f5a7fc 1 60 | http://morningproject.hateblo.jp/entry/2019/08/03/112605 -1 61 | https://gendai.ismedia.jp/articles/-/66255 -1 62 | https://oreno-yuigon.hatenablog.com/entry/2019/08/02/143028 -1 63 | https://toyokeizai.net/articles/-/295714 -1 64 | https://www.jigowatt121.com/entry/2019/08/03/183756 -1 65 | https://shogi.zukeran.org/2019/07/31/konsen-1/ -1 66 | https://note.mu/futashika/n/n382a4780b8bd -1 67 | https://www.hotpepper.jp/mesitsu/entry/hiro-watanabe/19-00148 -1 68 | https://srdk.rakuten.jp/entry/2019/08/02/103000 -1 69 | https://blog.hatenablog.com/entry/2019/07/26/180000 -1 70 | https://www.huffingtonpost.jp/entry/oomura-conference_jp_5d454be5e4b0aca3411e2fe0 -1 71 | https://togetter.com/li/1383361 -1 72 | https://biz-journal.jp/2019/08/post_112573.html -1 73 | 
https://www.mofumofu.pink/entry/2019/08/03/144340 -1 74 | https://nlab.itmedia.co.jp/nl/articles/1908/03/news013.html -1 75 | https://behappy.pink/fedelini-alla-puttanesca/ -1 76 | https://helloandgoodbyecraft.com/jokes -1 77 | https://toyokeizai.net/articles/-/295293 -1 78 | https://sirabee.com/2019/07/31/20162134165/ -1 79 | https://www.around50-konkatsu.info/entry/2019/08/03/南の島へ現実逃避 -1 80 | https://www7.ikutanpapa.com/entry/taketei -1 81 | https://datarobot.connpass.com/event/209149/ 1 82 | https://www.mediatechnology.jp/entry/2021/03/31/160000 1 83 | https://openreview.net/forum?id=IrM64DGB21 1 84 | https://github.com/intel-isl/DPT 1 85 | https://blog.amedama.jp/entry/lgbm-data-size-vs-best-iters 1 86 | https://recruit.gmo.jp/engineer/jisedai/blog/vision_transformer/ 1 87 | https://data.gunosy.io/entry/deim2021 1 88 | https://logmi.jp/tech/articles/324141 1 89 | https://qiita.com/jovyan/items/c41ab61a6b04e9a6e4df 1 90 | https://github.com/manujosephv/pytorch_tabular 1 91 | https://tech.retrieva.jp/entry/2021/04/01/114943 1 92 | https://memo.sugyan.com/entry/2021/04/02/005434 1 93 | https://rooftop.cc/news/2021/03/31160000.php -1 94 | https://www.tokio.inc/s/tokio/ -1 95 | https://firego8.com/fire%e3%81%97%e3%81%be%e3%81%97%e3%81%9f%ef%bc%81 -1 96 | https://comic-days.com/episode/3269632237302594670 -1 97 | https://www.youtube.com/watch?v=oGEUZuicEYM -1 98 | https://www.youtube.com/watch?v=Cs_l0LIhg5M -1 99 | https://nlab.itmedia.co.jp/nl/articles/2104/01/news105.html -1 100 | https://animeanime.jp/article/2021/04/02/60531.html -1 101 | https://ja.kohsuke.org/%E3%82%BD%E3%83%95%E3%83%88%E3%82%A6%E3%82%A7%E3%82%A2%E9%96%8B%E7%99%BA/%E5%84%AA%E7%A7%80%E3%81%95%E3%81%AB%E3%81%A4%E3%81%84%E3%81%A6/ -1 102 | https://togetter.com/li/1691643 -1 103 | https://www.yasuhisay.info/entry/2021/02/12/090000 1 104 | https://www.yasuhisay.info/entry/2021/02/25/130000 1 105 | https://www.yasuhisay.info/entry/2021/03/12/114500 1 106 | 
https://www.yasuhisay.info/entry/2021/03/24/090000 1 107 | https://www.yasuhisay.info/entry/2021/03/25/083000 1 108 | https://www.yasuhisay.info/entry/2021/03/28/143000 1 109 | https://wapa5pow.com/posts/2021-03-31--day-one-in-project 1 110 | https://aws.amazon.com/jp/blogs/startup/tech-case-study-jp-startup-ai-ml/ 1 111 | https://nhiroki.jp/2021/03/31/design-docs 1 112 | https://dev.classmethod.jp/articles/ways-to-check-fargate-cpu-usage/ 1 113 | https://dev.classmethod.jp/articles/amazon-route-53-resolver-dns-firewall/ 1 114 | https://scrapbox.io/mizdra/chrome_devtools_%E3%81%AE_tips_N%E9%80%A3%E7%99%BA 1 115 | https://zenn.dev/saboyutaka/articles/07f1351a6b0049 1 116 | --------------------------------------------------------------------------------