├── .github
├── dependabot.yml
└── workflows
│ └── build.yaml
├── LICENSE
├── Makefile
├── README.md
├── dbconfig.yml
├── go.mod
├── go.sum
├── lib
├── add
│ ├── add.go
│ └── add_test.go
├── annotation
│ ├── annotation.go
│ ├── annotation_cli.go
│ └── annotation_slack.go
├── classifier
│ ├── mira.go
│ └── mira_test.go
├── command
│ └── command.go
├── diagnosis
│ ├── diagnosis.go
│ ├── feature_weight
│ │ ├── feature_weight.go
│ │ └── feature_weight_test.go
│ └── label_conflict
│ │ ├── label_conflict.go
│ │ └── label_conflict_test.go
├── evaluation
│ ├── evaluation.go
│ └── evaluation_test.go
├── example
│ └── example.go
├── feature
│ ├── example
│ │ ├── example.go
│ │ └── example_test.go
│ ├── feature.go
│ └── tweet
│ │ ├── tweet.go
│ │ └── tweet_test.go
├── fetcher
│ ├── fetcher.go
│ └── fetcher_test.go
├── hatena_bookmark
│ ├── hatena_bookmark.go
│ └── hatena_bookmark_test.go
├── model
│ ├── error.go
│ ├── example.go
│ ├── hatena_bookmark.go
│ ├── label_type.go
│ ├── recommendation.go
│ ├── related_example.go
│ └── tweet.go
├── related_example
│ └── related_example.go
├── repository
│ ├── example.go
│ ├── example_test.go
│ ├── hatena_bookmark.go
│ ├── hatena_bookmark_test.go
│ ├── mira.go
│ ├── mira_test.go
│ ├── recommendation.go
│ ├── recommendation_test.go
│ ├── related_example.go
│ ├── related_example_test.go
│ ├── repository.go
│ ├── top_accessed_example.go
│ ├── top_accessed_example_test.go
│ ├── tweet.go
│ └── tweet_test.go
├── service
│ ├── example.go
│ ├── example_test.go
│ └── service.go
├── top_accessed_example
│ └── top_accessed_example.go
└── util
│ ├── converter
│ └── converter.go
│ ├── file
│ ├── file.go
│ └── file_test.go
│ ├── util.go
│ └── util_test.go
├── main.go
├── migrations
├── 0.sql
├── 1.sql
├── 10.sql
├── 11.sql
├── 12.sql
├── 13.sql
├── 14.sql
├── 15.sql
├── 16.sql
├── 2.sql
├── 3.sql
├── 4.sql
├── 5.sql
├── 6.sql
├── 7.sql
├── 8.sql
└── 9.sql
├── script
└── create_database.sql
└── tech_input_example.txt
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: gomod
4 | directory: "/"
5 | schedule:
6 | interval: daily
7 | time: "20:00"
8 | open-pull-requests-limit: 10
9 | reviewers:
10 | - syou6162
11 |
--------------------------------------------------------------------------------
/.github/workflows/build.yaml:
--------------------------------------------------------------------------------
1 | name: build and test
2 | on: [push]
3 |
4 | jobs:
5 | build:
6 | name: build and test
7 | runs-on: ubuntu-latest
8 | services:
9 | postgres:
10 | image: postgres:9.6
11 | env:
12 | POSTGRES_USER: nobody
13 | POSTGRES_PASSWORD: nobody
14 | POSTGRES_DB: go-active-learning-test
15 | ports:
16 | - 5432:5432
17 | options: >-
18 | --health-cmd pg_isready
19 | --health-interval 10s
20 | --health-timeout 5s
21 | --health-retries 5
22 | --name postgres
23 | steps:
24 | - name: checkout
25 | uses: actions/checkout@v2
26 | - name: format
27 | run: test `gofmt -l $(git ls-files | grep -e '\.go$' | grep -v -e vendor) | wc -l` = 0
28 | - name: deps
29 | run: make deps
30 | - name: build
31 | run: make build
32 | - name: test
33 | run: |
34 | export GOPATH=$HOME/go
35 | export GOBIN=$(go env GOPATH)/bin
36 | export PATH=$PATH:$GOPATH
37 | export PATH=$PATH:$GOBIN
38 | sql-migrate up -env=test
39 | make cover
40 | goveralls -coverprofile=${COVERAGE} -service=circle-ci -repotoken=${{ secrets.COVERALLS_TOKEN }}
41 | env:
42 | POSTGRES_HOST: localhost
43 | POSTGRES_PORT: 5432
44 | POSTGRES_USER: nobody
45 | POSTGRES_PASSWORD: nobody
46 | COVERAGE: coverage.out
47 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Yasuhisa Yoshida
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | COVERAGE = coverage.out
2 | export GO111MODULE := on
3 |
4 | all: build
5 |
6 | .PHONY: deps
7 | deps:
8 | go mod download
9 | go get github.com/mattn/goveralls
10 | go get github.com/haya14busa/goverage
11 | go get github.com/rubenv/sql-migrate/sql-migrate
12 |
13 | .PHONY: build
14 | build:
15 | go build -v
16 |
17 | .PHONY: fmt
18 | fmt:
19 | gofmt -s -w $$(git ls-files | grep -e '\.go$$' | grep -v -e vendor)
20 | goimports -w $$(git ls-files | grep -e '\.go$$' | grep -v -e vendor)
21 |
22 | .PHONY: test
23 | test:
24 | DB_NAME=go-active-learning-test go test -v ./... -p 1 -count 1
25 |
# Run static analysis on every package in the module.
# `go tool vet` can no longer be invoked directly since Go 1.12 (the version
# declared in go.mod); `go vet` is the supported entry point and enables all
# default analyzers, which is what the former `--all` flag asked for.
.PHONY: vet
vet:
	go vet ./...
29 |
30 | .PHONY: test-all
31 | test-all: vet test
32 |
33 | .PHONY: cover
34 | cover:
35 | DB_NAME=go-active-learning-test goverage -parallel 1 -v -coverprofile=${COVERAGE} ./...
36 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # go-active-learning
2 | [](https://circleci.com/gh/syou6162/go-active-learning)
3 | [](https://goreportcard.com/report/github.com/syou6162/go-active-learning)
4 | [](https://coveralls.io/github/syou6162/go-active-learning?branch=master)
5 |
6 | go-active-learning is a command-line annotation tool for binary classification problems, written in Go. It uses a simple active learning algorithm to reduce annotation time.
7 |
8 | # Install
9 |
10 | ```console
11 | % go get github.com/syou6162/go-active-learning
12 | ```
13 |
14 | ## Build from source
15 |
16 | ```console
17 | % git clone https://github.com/syou6162/go-active-learning.git
18 | % cd go-active-learning
19 | % createdb go-active-learning
20 | % createdb go-active-learning-test
21 | % sql-migrate up -env=local
22 | % sql-migrate up -env=test
23 | % make build
24 | ```
25 |
26 | # Usage
27 | go-active-learning has an `annotate` mode (annotate new examples suggested by active learning) and a `diagnose` mode (check label conflicts in training data). To see the detailed options, type `./go-active-learning --help`.
28 |
29 | ## Annotation mode
30 | To see the detailed options, type `./go-active-learning annotate --help`.
31 |
32 | ## Annotate new examples from command line interface
33 | To see the detailed options, type `./go-active-learning annotate cli --help`.
34 |
35 | ```console
36 | % ./go-active-learning annotate cli --open-url
37 | Loading cache...
38 | Label this example (Score: 0.600): http://srdk.rakuten.jp/ (それどこ)
39 |
40 | p: Label this example as positive.
41 | n: Label this example as negative.
42 | s: Skip this example.
43 | h: Show this help.
44 | e: Exit.
45 |
46 | Label this example (Score: 1.000): http://srdk.rakuten.jp/ (それどこ)
47 | Labeled as negative
48 | ```
49 |
50 | ## Annotate new examples from slack
51 | To see the detailed options, type `./go-active-learning annotate slack --help`. To annotate new examples from Slack, you need to create a Slack bot and obtain a token from [here](https://my.slack.com/services/new/bot). You can pass the token via an environment variable (`SLACK_TOKEN`).
52 |
53 | ```console
54 | % export SLACK_TOKEN=xoxb-SLACK-TOKEN
55 | % ./go-active-learning annotate slack --filter-status-code-ok --channel CHANNEL_ID
56 | ```
57 |
58 | ## Diagnosis mode
59 | To see the detailed options, type `./go-active-learning diagnose --help`.
60 |
61 | ### Diagnose training data
62 | This subcommand diagnoses label conflicts in the training data. A 'conflict' means that the annotated label is -1 (or 1) but the label predicted by the model is 1 (or -1). In the example below, `http://www3.nhk.or.jp/news/` is a conflict case ('Label' is -1, but 'Score' is positive). You may need to collect more such news articles to train a good classifier.
63 |
64 | ```console
65 | % ./go-active-learning diagnose label-conflict
66 | Loading cache...
67 | Index Label Score URL Title
68 | 0 -1 0.491 http://www3.nhk.or.jp/news/
69 | 1 1 0.491 http://blog.yuuk.io/
70 | 2 1 0.491 http://www.yasuhisay.info/
71 | 3 -1 -3.057 http://r.gnavi.co.jp/g-interview/ ぐるなび みんなのごはん
72 | 4 1 4.264 http://hakobe932.hatenablog.com/ hakobe-blog ♨
73 | 5 -1 -7.151 http://suumo.jp/town/ SUUMOタウン
74 | 6 -1 -26.321 https://www.facebook.com/ ログイン (日本語)
75 | 7 1 44.642 http://www.songmu.jp/riji/ おそらくはそれさえも平凡な日々
76 | 8 1 121.170 http://motemen.hatenablog.com/ 詩と創作・思索のひろば
77 | Saving cache...
78 | ```
79 |
80 | ### Diagnose feature weight
81 | This subcommand lists pairs of feature weights and their names.
82 |
83 | ```console
84 | % ./go-active-learning diagnose feature-weight --filter-status-code-ok | head -n 10
85 | +0.80 BODY:/
86 | +0.80 BODY:ほか
87 | +0.80 BODY:郁
88 | +0.80 BODY:単行本
89 | +0.80 BODY:姿
90 | +0.80 BODY:暗黙
91 | +0.80 BODY:創造
92 | +0.80 BODY:企業
93 | +0.80 BODY:野中
94 | +0.80 BODY:準備
95 | ```
96 |
97 | # Author
98 | Yasuhisa Yoshida
99 |
--------------------------------------------------------------------------------
/dbconfig.yml:
--------------------------------------------------------------------------------
1 | test:
2 | dialect: postgres
3 | datasource: host=localhost user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} dbname=go-active-learning-test sslmode=disable
4 | dir: migrations
5 |
6 | local:
7 | dialect: postgres
8 | datasource: host=localhost user=nobody password=nobody dbname=go-active-learning sslmode=disable
9 | dir: migrations
10 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/syou6162/go-active-learning
2 |
3 | go 1.12
4 |
5 | require (
6 | github.com/PuerkitoBio/goquery v1.5.1
7 | github.com/araddon/dateparse v0.0.0-20190622164848-0fb0a474d195 // indirect
8 | github.com/fatih/set v0.2.1 // indirect
9 | github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 // indirect
10 | github.com/gorilla/websocket v1.4.0 // indirect
11 | github.com/ikawaha/kagome v1.11.2
12 | github.com/jaytaylor/html2text v0.0.0-20190408195923-01ec452cbe43 // indirect
13 | github.com/jdkato/prose v0.0.0-20181022194337-a179b97cfa6f
14 | github.com/jmoiron/sqlx v1.3.1
15 | github.com/lib/pq v1.10.0
16 | github.com/mackerelio/mackerel-client-go v0.16.0
17 | github.com/mattn/go-isatty v0.0.8 // indirect
18 | github.com/mattn/go-runewidth v0.0.4 // indirect
19 | github.com/mattn/go-tty v0.0.0-20190424173100-523744f04859
20 | github.com/montanaflynn/stats v0.5.0 // indirect
21 | github.com/neurosnap/sentences v1.0.6 // indirect
22 | github.com/nlopes/slack v0.6.0
23 | github.com/olekukonko/tablewriter v0.0.1 // indirect
24 | github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4
25 | github.com/pkg/errors v0.9.1
26 | github.com/shogo82148/go-shuffle v0.0.0-20180218125048-27e6095f230d // indirect
27 | github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect
28 | github.com/stretchr/testify v1.3.0 // indirect
29 | github.com/syou6162/GoOse v0.0.0-20190108170554-09969ebeb09f
30 | github.com/urfave/cli v1.22.5
31 | golang.org/x/sys v0.0.0-20190804053845-51ab0e2deafa // indirect
32 | golang.org/x/text v0.3.2 // indirect
33 | gopkg.in/neurosnap/sentences.v1 v1.0.6 // indirect
34 | gopkg.in/vmarkovtsev/go-lcss.v1 v1.0.0-20181020221121-dfc501d07ea0
35 | )
36 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
2 | github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
3 | github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
4 | github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
5 | github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
6 | github.com/araddon/dateparse v0.0.0-20190622164848-0fb0a474d195 h1:c4mLfegoDw6OhSJXTd2jUEQgZUQuJWtocudb97Qn9EM=
7 | github.com/araddon/dateparse v0.0.0-20190622164848-0fb0a474d195/go.mod h1:SLqhdZcd+dF3TEVL2RMoob5bBP5R1P1qkox+HtCBgGI=
8 | github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY=
9 | github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
10 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
11 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
12 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
13 | github.com/fatih/set v0.2.1 h1:nn2CaJyknWE/6txyUDGwysr3G5QC6xWB/PtVjPBbeaA=
14 | github.com/fatih/set v0.2.1/go.mod h1:+RKtMCH+favT2+3YecHGxcc0b4KyVWA1QWWJUs4E0CI=
15 | github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 h1:u8AQ9bPa9oC+8/A/jlWouakhIvkFfuxgIIRjiy8av7I=
16 | github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573/go.mod h1:eBvb3i++NHDH4Ugo9qCvMw8t0mTSctaEa5blJbWcNxs=
17 | github.com/go-sql-driver/mysql v1.5.0 h1:ozyZYNQW3x3HtqT1jira07DN2PArx2v7/mN66gGcHOs=
18 | github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg=
19 | github.com/gorilla/websocket v1.2.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ=
20 | github.com/gorilla/websocket v1.4.0 h1:WDFjx/TMzVgy9VdMMQi2K2Emtwi2QcUQsztZ/zLaH/Q=
21 | github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ=
22 | github.com/ikawaha/kagome v1.11.2 h1:eCWpLqv5Euqa5JcwkaobUSy6uGM8rwwMw5Su3eRepBI=
23 | github.com/ikawaha/kagome v1.11.2/go.mod h1:lHwhkGuuWqKWTxeQMppD0EmQAfKbc39QKx9qoWqgo+A=
24 | github.com/jaytaylor/html2text v0.0.0-20190408195923-01ec452cbe43 h1:jTkyeF7NZ5oIr0ESmcrpiDgAfoidCBF4F5kJhjtaRwE=
25 | github.com/jaytaylor/html2text v0.0.0-20190408195923-01ec452cbe43/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk=
26 | github.com/jdkato/prose v0.0.0-20181022194337-a179b97cfa6f h1:AQ+AwWeEFf6NsjaMzhuVKLfxZH1+i7aoHuYXObQAzDo=
27 | github.com/jdkato/prose v0.0.0-20181022194337-a179b97cfa6f/go.mod h1:jkF0lkxaX5PFSlk9l4Gh9Y+T57TqUZziWT7uZbW5ADg=
28 | github.com/jmoiron/sqlx v1.3.1 h1:aLN7YINNZ7cYOPK3QC83dbM6KT0NMqVMw961TqrejlE=
29 | github.com/jmoiron/sqlx v1.3.1/go.mod h1:2BljVx/86SuTyjE+aPYlHCTNvZrnJXghYGpNiXLBMCQ=
30 | github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
31 | github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
32 | github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
33 | github.com/lib/pq v1.10.0 h1:Zx5DJFEYQXio93kgXnQ09fXNiUKsqv4OUEu2UtGcB1E=
34 | github.com/lib/pq v1.10.0/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
35 | github.com/mackerelio/mackerel-client-go v0.16.0 h1:9AoqOg+kX07QsVBGN8yD3Zx0skd+cGqESp7kXquDjDs=
36 | github.com/mackerelio/mackerel-client-go v0.16.0/go.mod h1:/GNOj+y1eFsd3CK8c6IQ/uS38/GT0+NWImk5YGJs5Lk=
37 | github.com/mattn/go-isatty v0.0.8 h1:HLtExJ+uU2HOZ+wI0Tt5DtUDrx8yhUqDcp7fYERX4CE=
38 | github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=
39 | github.com/mattn/go-runewidth v0.0.4 h1:2BvfKmzob6Bmd4YsL0zygOqfdFnK7GR4QL06Do4/p7Y=
40 | github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
41 | github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg=
42 | github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
43 | github.com/mattn/go-tty v0.0.0-20190424173100-523744f04859 h1:smQbSzmT3EHl4EUwtFwFGmGIpiYgIiiPeVv1uguIQEE=
44 | github.com/mattn/go-tty v0.0.0-20190424173100-523744f04859/go.mod h1:XPvLUNfbS4fJH25nqRHfWLMa1ONC8Amw+mIA639KxkE=
45 | github.com/montanaflynn/stats v0.5.0 h1:2EkzeTSqBB4V4bJwWrt5gIIrZmpJBcoIRGS2kWLgzmk=
46 | github.com/montanaflynn/stats v0.5.0/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc=
47 | github.com/neurosnap/sentences v1.0.6 h1:iBVUivNtlwGkYsJblWV8GGVFmXzZzak907Ci8aA0VTE=
48 | github.com/neurosnap/sentences v1.0.6/go.mod h1:pg1IapvYpWCJJm/Etxeh0+gtMf1rI1STY9S7eUCPbDc=
49 | github.com/nlopes/slack v0.6.0 h1:jt0jxVQGhssx1Ib7naAOZEZcGdtIhTzkP0nopK0AsRA=
50 | github.com/nlopes/slack v0.6.0/go.mod h1:JzQ9m3PMAqcpeCam7UaHSuBuupz7CmpjehYMayT6YOk=
51 | github.com/olekukonko/tablewriter v0.0.1 h1:b3iUnf1v+ppJiOfNX4yxxqfWKMQPZR5yoh8urCTFX88=
52 | github.com/olekukonko/tablewriter v0.0.1/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo=
53 | github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4 h1:49lOXmGaUpV9Fz3gd7TFZY106KVlPVa5jcYD1gaQf98=
54 | github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4/go.mod h1:4OwLy04Bl9Ef3GJJCoec+30X3LQs/0/m4HFRt/2LUSA=
55 | github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
56 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
57 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
58 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
59 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
60 | github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=
61 | github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
62 | github.com/shogo82148/go-shuffle v0.0.0-20180218125048-27e6095f230d h1:rUbV6LJa5RXK3jT/4jnJUz3UkrXzW6cqB+n9Fkbv9jY=
63 | github.com/shogo82148/go-shuffle v0.0.0-20180218125048-27e6095f230d/go.mod h1:2htx6lmL0NGLHlO8ZCf+lQBGBHIbEujyywxJArf+2Yc=
64 | github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
65 | github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
66 | github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo=
67 | github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM=
68 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
69 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
70 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
71 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
72 | github.com/syou6162/GoOse v0.0.0-20190108170554-09969ebeb09f h1:KZTZZaZYr4F+0V3AUEs2ZvOGYFlUKFdAWt+CkyhC2Wc=
73 | github.com/syou6162/GoOse v0.0.0-20190108170554-09969ebeb09f/go.mod h1:T2hVrnNfCW4aQcCS7ReyHEKMEZat4F+fxMCzBlf1Q8g=
74 | github.com/urfave/cli v1.22.5 h1:lNq9sAHXK2qfdI8W+GRItjCEkI+2oR4d+MEHy1CKXoU=
75 | github.com/urfave/cli v1.22.5/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
76 | github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
77 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
78 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
79 | golang.org/x/lint v0.0.0-20200302205851-738671d3881b h1:Wh+f8QHJXR411sJR8/vRBTZ7YapZaRvUcLFFJhusH0k=
80 | golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY=
81 | golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
82 | golang.org/x/mod v0.2.0 h1:KU7oHjnv3XNWfa5COkzUifxZmxp1TyI7ImMXqFxLwvQ=
83 | golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
84 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
85 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
86 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
87 | golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
88 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b h1:0mm1VjtFUOIlE1SbDlwjYaDxZVDP2S5ou6y0gSgXHu8=
89 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
90 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
91 | golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
92 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
93 | golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
94 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
95 | golang.org/x/sys v0.0.0-20190804053845-51ab0e2deafa h1:KIDDMLT1O0Nr7TSxp8xM5tJcdn8tgyAONntO829og1M=
96 | golang.org/x/sys v0.0.0-20190804053845-51ab0e2deafa/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
97 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
98 | golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
99 | golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
100 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
101 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
102 | golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
103 | golang.org/x/tools v0.0.0-20200612022331-742c5eb664c2 h1:DVqHa33CzfnTKwUV6be+I4hp31W6iXn3ZiEcdKGzLyI=
104 | golang.org/x/tools v0.0.0-20200612022331-742c5eb664c2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
105 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
106 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
107 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
108 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
109 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
110 | gopkg.in/neurosnap/sentences.v1 v1.0.6 h1:v7ElyP020iEZQONyLld3fHILHWOPs+ntzuQTNPkul8E=
111 | gopkg.in/neurosnap/sentences.v1 v1.0.6/go.mod h1:YlK+SN+fLQZj+kY3r8DkGDhDr91+S3JmTb5LSxFRQo0=
112 | gopkg.in/vmarkovtsev/go-lcss.v1 v1.0.0-20181020221121-dfc501d07ea0 h1:YY+ZVPsg2oJnV1rpzwIWtuCtQk71YFwuk47mMtjraN4=
113 | gopkg.in/vmarkovtsev/go-lcss.v1 v1.0.0-20181020221121-dfc501d07ea0/go.mod h1:6LhSPGi1OSJsWUQZridpjQXWEnDzw7EZAXSjc5SyF8A=
114 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
115 |
--------------------------------------------------------------------------------
/lib/add/add.go:
--------------------------------------------------------------------------------
1 | package add
2 |
3 | import (
4 | "fmt"
5 | "log"
6 | "time"
7 |
8 | "os"
9 |
10 | mkr "github.com/mackerelio/mackerel-client-go"
11 | "github.com/syou6162/go-active-learning/lib/classifier"
12 | "github.com/syou6162/go-active-learning/lib/hatena_bookmark"
13 | "github.com/syou6162/go-active-learning/lib/service"
14 | "github.com/syou6162/go-active-learning/lib/util"
15 | "github.com/syou6162/go-active-learning/lib/util/file"
16 | "github.com/urfave/cli"
17 | )
18 |
19 | func doAdd(c *cli.Context) error {
20 | inputFilename := c.String("input-filename")
21 |
22 | if inputFilename == "" {
23 | _ = cli.ShowCommandHelp(c, "add")
24 | return cli.NewExitError("`input-filename` is a required field.", 1)
25 | }
26 |
27 | app, err := service.NewDefaultApp()
28 | if err != nil {
29 | return err
30 | }
31 | defer app.Close()
32 |
33 | examples, err := file.ReadExamples(inputFilename)
34 | if err != nil {
35 | return err
36 | }
37 |
38 | if err := app.AttachMetadata(examples, 0, 0); err != nil {
39 | return err
40 | }
41 |
42 | examples = util.FilterStatusCodeNotOkExamples(examples)
43 | app.Fetch(examples)
44 | examples = util.FilterStatusCodeOkExamples(examples)
45 |
46 | m, err := app.FindLatestMIRAModel(classifier.EXAMPLE)
47 | skipPredictScore := false
48 | if err != nil {
49 | log.Println(fmt.Sprintf("Error to load model %s", err.Error()))
50 | skipPredictScore = true
51 | }
52 |
53 | for _, e := range examples {
54 | if !skipPredictScore {
55 | e.Score = m.PredictScore(e.Fv)
56 | }
57 | if e.CreatedAt.Before(time.Date(2000, 01, 01, 0, 0, 0, 0, time.Local)) {
58 | log.Println(fmt.Sprintf("Skipin too old example: %s", e.Url))
59 | continue
60 | }
61 | if err = app.UpdateOrCreateExample(e); err != nil {
62 | log.Println(fmt.Sprintf("Error occured proccessing %s %s", e.Url, err.Error()))
63 | continue
64 | }
65 | if err = app.UpdateFeatureVector(e); err != nil {
66 | log.Println(fmt.Sprintf("Error occured proccessing %s feature vector %s", e.Url, err.Error()))
67 | continue
68 | }
69 | if bookmark, err := hatena_bookmark.GetHatenaBookmark(e.FinalUrl); err == nil {
70 | e.HatenaBookmark = bookmark
71 | app.UpdateHatenaBookmark(e)
72 | }
73 | }
74 |
75 | if err := postNumOfExamplesToMackerel(app); err != nil {
76 | return err
77 | }
78 |
79 | return nil
80 | }
81 |
82 | func postNumOfExamplesToMackerel(app service.GoActiveLearningApp) error {
83 | cnt, err := app.CountPositiveExamples()
84 | if err != nil {
85 | return err
86 | }
87 | if err := postNumOfExamplesByLabelToMackerel("count.positive", cnt); err != nil {
88 | return err
89 | }
90 |
91 | cnt, err = app.CountNegativeExamples()
92 | if err != nil {
93 | return err
94 | }
95 | if err := postNumOfExamplesByLabelToMackerel("count.negative", cnt); err != nil {
96 | return err
97 | }
98 |
99 | cnt, err = app.CountUnlabeledExamples()
100 | if err != nil {
101 | return err
102 | }
103 | if err := postNumOfExamplesByLabelToMackerel("count.unlabeled", cnt); err != nil {
104 | return err
105 | }
106 | return nil
107 | }
108 |
109 | func postNumOfExamplesByLabelToMackerel(label string, cnt int) error {
110 | apiKey := os.Getenv("MACKEREL_APIKEY")
111 | serviceName := os.Getenv("MACKEREL_SERVICE_NAME")
112 | if apiKey == "" || serviceName == "" {
113 | return nil
114 | }
115 |
116 | client := mkr.NewClient(apiKey)
117 | now := time.Now().Unix()
118 | err := client.PostServiceMetricValues(serviceName, []*mkr.MetricValue{
119 | {
120 | Name: label,
121 | Time: now,
122 | Value: cnt,
123 | },
124 | })
125 | return err
126 | }
127 |
128 | var CommandAdd = cli.Command{
129 | Name: "add",
130 | Usage: "add urls",
131 | Description: `
132 | Add urls.
133 | `,
134 | Action: doAdd,
135 | Flags: []cli.Flag{
136 | cli.StringFlag{Name: "input-filename"},
137 | },
138 | }
139 |
--------------------------------------------------------------------------------
/lib/add/add_test.go:
--------------------------------------------------------------------------------
1 | package add_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/syou6162/go-active-learning/lib/command"
7 | "github.com/urfave/cli"
8 | )
9 |
10 | func TestDoAdd(t *testing.T) {
11 | app := cli.NewApp()
12 | app.Commands = command.Commands
13 | args := []string{
14 | "go-active-learning-web",
15 | "add",
16 | "--input-filename=../../tech_input_example.txt",
17 | }
18 |
19 | if err := app.Run(args); err != nil {
20 | t.Error(err)
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/lib/annotation/annotation.go:
--------------------------------------------------------------------------------
1 | package annotation
2 |
3 | import (
4 | "github.com/syou6162/go-active-learning/lib/classifier"
5 | "github.com/syou6162/go-active-learning/lib/model"
6 | "github.com/urfave/cli"
7 | )
8 |
// ActionType identifies what the annotator asked to do with the example
// currently being shown.
type ActionType int

// Actions an annotator can trigger with a single key press.
const (
	LABEL_AS_POSITIVE ActionType = iota // 'p'
	LABEL_AS_NEGATIVE                   // 'n'
	HELP                                // 'h', and every unrecognised key
	SKIP                                // 's'
	EXIT                                // 'e'
)

// rune2ActionType translates a pressed key into its ActionType.
// Keys without a mapping fall back to HELP.
func rune2ActionType(r rune) ActionType {
	keymap := map[rune]ActionType{
		'p': LABEL_AS_POSITIVE,
		'n': LABEL_AS_NEGATIVE,
		's': SKIP,
		'h': HELP,
		'e': EXIT,
	}
	if action, known := keymap[r]; known {
		return action
	}
	return HELP
}
35 |
36 | func NextExampleToBeAnnotated(m classifier.MIRAClassifier, examples model.Examples) *model.Example {
37 | unlabeledExamples := m.SortByScore(examples)
38 | if len(unlabeledExamples) == 0 {
39 | return nil
40 | }
41 | e := unlabeledExamples[0]
42 | if e == nil {
43 | return nil
44 | }
45 | return e
46 | }
47 |
48 | var ActionHelpDoc = `
49 | p: Label this example as positive.
50 | n: Label this example as negative.
51 | s: Skip this example.
52 | h: Show this help.
53 | e: Exit.
54 | `
55 |
56 | var CommandAnnotate = cli.Command{
57 | Name: "annotate",
58 | Usage: "Annotate URLs",
59 | Description: `
60 | Annotate URLs using active learning.
61 | `,
62 | Subcommands: []cli.Command{
63 | {
64 | Name: "cli",
65 | Usage: "Annotate URLs using cli",
66 | Description: `
67 | Annotate URLs using active learning using cli.
68 | `,
69 | Action: doAnnotate,
70 | Flags: []cli.Flag{
71 | cli.BoolFlag{Name: "open-url", Usage: "Open url in background"},
72 | cli.BoolFlag{Name: "filter-status-code-ok", Usage: "Use only examples with status code = 200"},
73 | cli.BoolFlag{Name: "show-active-features"},
74 | },
75 | },
76 | {
77 | Name: "slack",
78 | Usage: "Annotate URLs using slack",
79 | Description: `
80 | Annotate URLs using active learning using slack.
81 | `,
82 | Action: doAnnotateWithSlack,
83 | Flags: []cli.Flag{
84 | cli.StringFlag{Name: "channel"},
85 | cli.BoolFlag{Name: "filter-status-code-ok", Usage: "Use only examples with status code = 200"},
86 | },
87 | },
88 | },
89 | }
90 |
--------------------------------------------------------------------------------
/lib/annotation/annotation_cli.go:
--------------------------------------------------------------------------------
1 | package annotation
2 |
3 | import (
4 | "fmt"
5 | "os"
6 |
7 | "math"
8 | "sort"
9 |
10 | "github.com/mattn/go-tty"
11 | "github.com/pkg/browser"
12 | "github.com/syou6162/go-active-learning/lib/classifier"
13 | "github.com/syou6162/go-active-learning/lib/example"
14 | "github.com/syou6162/go-active-learning/lib/model"
15 | "github.com/syou6162/go-active-learning/lib/service"
16 | "github.com/syou6162/go-active-learning/lib/util"
17 | "github.com/syou6162/go-active-learning/lib/util/converter"
18 | "github.com/urfave/cli"
19 | )
20 |
21 | func input2ActionType() (ActionType, error) {
22 | t, err := tty.Open()
23 | defer t.Close()
24 | if err != nil {
25 | return EXIT, err
26 | }
27 | var r rune
28 | for r == 0 {
29 | r, err = t.ReadRune()
30 | if err != nil {
31 | return HELP, err
32 | }
33 | }
34 | return rune2ActionType(r), nil
35 | }
36 |
37 | func doAnnotate(c *cli.Context) error {
38 | openUrl := c.Bool("open-url")
39 | filterStatusCodeOk := c.Bool("filter-status-code-ok")
40 | showActiveFeatures := c.Bool("show-active-features")
41 |
42 | app, err := service.NewDefaultApp()
43 | if err != nil {
44 | return err
45 | }
46 | defer app.Close()
47 |
48 | examples, err := app.SearchExamples()
49 | if err != nil {
50 | return err
51 | }
52 |
53 | stat := example.GetStat(examples)
54 | fmt.Fprintln(os.Stderr, fmt.Sprintf("Positive:%d, Negative:%d, Unlabeled:%d", stat["positive"], stat["negative"], stat["unlabeled"]))
55 |
56 | app.Fetch(examples)
57 | for _, e := range examples {
58 | app.UpdateFeatureVector(e)
59 | }
60 | if filterStatusCodeOk {
61 | examples = util.FilterStatusCodeOkExamples(examples)
62 | }
63 |
64 | m, err := classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples))
65 | if err != nil {
66 | return err
67 | }
68 |
69 | annotationLoop:
70 | for {
71 | e := NextExampleToBeAnnotated(*m, examples)
72 | if e == nil {
73 | fmt.Println("No example")
74 | break annotationLoop
75 | }
76 | fmt.Println("Label this example (Score: " + fmt.Sprintf("%+0.03f", e.Score) + "): " + e.Url + " (" + e.Title + ")")
77 |
78 | if openUrl {
79 | browser.OpenURL(e.Url)
80 | }
81 | if showActiveFeatures {
82 | ShowActiveFeatures(*m, *e, 5)
83 | }
84 |
85 | act, err := input2ActionType()
86 | if err != nil {
87 | return err
88 | }
89 | switch act {
90 | case LABEL_AS_POSITIVE:
91 | fmt.Println("Labeled as positive")
92 | e.Annotate(model.POSITIVE)
93 | app.UpdateOrCreateExample(e)
94 | case LABEL_AS_NEGATIVE:
95 | fmt.Println("Labeled as negative")
96 | e.Annotate(model.NEGATIVE)
97 | app.UpdateOrCreateExample(e)
98 | case SKIP:
99 | fmt.Println("Skiped this example")
100 | examples = util.RemoveExample(examples, *e)
101 | continue
102 | case HELP:
103 | fmt.Println(ActionHelpDoc)
104 | case EXIT:
105 | fmt.Println("EXIT")
106 | break annotationLoop
107 | default:
108 | break annotationLoop
109 | }
110 |
111 | m, err = classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples))
112 | if err != nil {
113 | return err
114 | }
115 | }
116 |
117 | return nil
118 | }
119 |
// FeatureWeightPair couples a feature name with the weight a classifier
// assigns to it.
type FeatureWeightPair struct {
	Feature string
	Weight float64
}

// FeatureWeightPairs is a list of FeatureWeightPair. Its Len/Less/Swap methods
// order it by ascending absolute weight.
type FeatureWeightPairs []FeatureWeightPair
126 |
127 | func SortedActiveFeatures(model classifier.MIRAClassifier, example model.Example, n int) FeatureWeightPairs {
128 | pairs := FeatureWeightPairs{}
129 | for _, f := range example.Fv {
130 | pairs = append(pairs, FeatureWeightPair{f, model.GetWeight(f)})
131 | }
132 | sort.Sort(sort.Reverse(pairs))
133 |
134 | result := FeatureWeightPairs{}
135 | cnt := 0
136 | for _, pair := range pairs {
137 | if cnt >= n {
138 | break
139 | }
140 | if (example.Score > 0.0 && pair.Weight > 0.0) || (example.Score < 0.0 && pair.Weight < 0.0) {
141 | result = append(result, pair)
142 | cnt++
143 | }
144 | }
145 | return result
146 | }
147 |
148 | func ShowActiveFeatures(model classifier.MIRAClassifier, example model.Example, n int) {
149 | for _, pair := range SortedActiveFeatures(model, example, n) {
150 | fmt.Println(fmt.Sprintf("%+0.1f %s", pair.Weight, pair.Feature))
151 | }
152 | }
153 |
154 | func (slice FeatureWeightPairs) Len() int {
155 | return len(slice)
156 | }
157 |
158 | func (slice FeatureWeightPairs) Less(i, j int) bool {
159 | return math.Abs(slice[i].Weight) < math.Abs(slice[j].Weight)
160 | }
161 |
162 | func (slice FeatureWeightPairs) Swap(i, j int) {
163 | slice[i], slice[j] = slice[j], slice[i]
164 | }
165 |
--------------------------------------------------------------------------------
/lib/annotation/annotation_slack.go:
--------------------------------------------------------------------------------
1 | package annotation
2 |
3 | import (
4 | "fmt"
5 | "os"
6 |
7 | "github.com/nlopes/slack"
8 | "github.com/pkg/errors"
9 | "github.com/syou6162/go-active-learning/lib/classifier"
10 | "github.com/syou6162/go-active-learning/lib/example"
11 | "github.com/syou6162/go-active-learning/lib/model"
12 | "github.com/syou6162/go-active-learning/lib/service"
13 | "github.com/syou6162/go-active-learning/lib/util"
14 | "github.com/syou6162/go-active-learning/lib/util/converter"
15 | "github.com/urfave/cli"
16 | )
17 |
18 | func doAnnotateWithSlack(c *cli.Context) error {
19 | channelID := c.String("channel")
20 | filterStatusCodeOk := c.Bool("filter-status-code-ok")
21 |
22 | if channelID == "" {
23 | _ = cli.ShowCommandHelp(c, "slack")
24 | return cli.NewExitError("`channel` is a required field.", 1)
25 | }
26 |
27 | api := slack.New(os.Getenv("SLACK_TOKEN"))
28 | rtm := api.NewRTM()
29 | go rtm.ManageConnection()
30 |
31 | app, err := service.NewDefaultApp()
32 | if err != nil {
33 | return err
34 | }
35 | defer app.Close()
36 |
37 | examples, err := app.SearchExamples()
38 | if err != nil {
39 | return err
40 | }
41 |
42 | stat := example.GetStat(examples)
43 | msg := rtm.NewOutgoingMessage(fmt.Sprintf("Positive:%d, Negative:%d, Unlabeled:%d", stat["positive"], stat["negative"], stat["unlabeled"]), channelID)
44 | rtm.SendMessage(msg)
45 |
46 | app.Fetch(examples)
47 | for _, e := range examples {
48 | app.UpdateFeatureVector(e)
49 | }
50 | if filterStatusCodeOk {
51 | examples = util.FilterStatusCodeOkExamples(examples)
52 | }
53 |
54 | m, err := classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples))
55 | if err != nil {
56 | return err
57 | }
58 | e := NextExampleToBeAnnotated(*m, examples)
59 | if e == nil {
60 | return errors.New("No e to annotate")
61 | }
62 |
63 | rtm.SendMessage(rtm.NewOutgoingMessage("Ready to annotate!", channelID))
64 | showExample(rtm, *m, e, channelID)
65 | prevTimestamp := ""
66 |
67 | annotationLoop:
68 | for {
69 | select {
70 | case msg := <-rtm.IncomingEvents:
71 | switch ev := msg.Data.(type) {
72 | case *slack.AckMessage:
73 | prevTimestamp = ev.Timestamp
74 | case *slack.MessageEvent:
75 | if ev.Channel != channelID {
76 | break
77 | }
78 | text := ev.Text
79 | if len(text) > 1 || len(text) == 0 {
80 | break
81 | }
82 | r := []rune(text)[0]
83 | act := rune2ActionType(r)
84 |
85 | switch act {
86 | case LABEL_AS_POSITIVE:
87 | e.Annotate(model.POSITIVE)
88 | m, err = classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples))
89 | if err != nil {
90 | return err
91 | }
92 | rtm.AddReaction("heavy_plus_sign", slack.NewRefToMessage(channelID, prevTimestamp))
93 | case LABEL_AS_NEGATIVE:
94 | e.Annotate(model.NEGATIVE)
95 | m, err = classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples))
96 | if err != nil {
97 | return err
98 | }
99 | rtm.AddReaction("heavy_minus_sign", slack.NewRefToMessage(channelID, prevTimestamp))
100 | case SKIP:
101 | rtm.SendMessage(rtm.NewOutgoingMessage("Skiped this e", channelID))
102 | examples = util.RemoveExample(examples, *e)
103 | break
104 | case HELP:
105 | rtm.SendMessage(rtm.NewOutgoingMessage(ActionHelpDoc, channelID))
106 | case EXIT:
107 | rtm.SendMessage(rtm.NewOutgoingMessage("EXIT", channelID))
108 | break annotationLoop
109 | default:
110 | break annotationLoop
111 | }
112 | e = NextExampleToBeAnnotated(*m, examples)
113 | if e == nil {
114 | return errors.New("No e to annotate")
115 | }
116 | showExample(rtm, *m, e, channelID)
117 | case *slack.InvalidAuthEvent:
118 | return errors.New("Invalid credentials")
119 | default:
120 | }
121 | }
122 | }
123 | return nil
124 | }
125 |
126 | func showExample(rtm *slack.RTM, model classifier.MIRAClassifier, example *model.Example, channelID string) {
127 | activeFeaturesStr := "Active Features: "
128 | for _, pair := range SortedActiveFeatures(model, *example, 5) {
129 | activeFeaturesStr += fmt.Sprintf("%s(%+0.1f) ", pair.Feature, pair.Weight)
130 | }
131 | rtm.SendMessage(rtm.NewOutgoingMessage(fmt.Sprintf("%s\nScore: %+0.2f\n%s", example.Url, example.Score, activeFeaturesStr), channelID))
132 | }
133 |
--------------------------------------------------------------------------------
/lib/classifier/mira.go:
--------------------------------------------------------------------------------
1 | package classifier
2 |
3 | import (
4 | "fmt"
5 | "math"
6 | "math/rand"
7 | "os"
8 | "runtime"
9 | "sort"
10 | "sync"
11 |
12 | "github.com/pkg/errors"
13 | "github.com/syou6162/go-active-learning/lib/evaluation"
14 | "github.com/syou6162/go-active-learning/lib/feature"
15 | "github.com/syou6162/go-active-learning/lib/model"
16 | "github.com/syou6162/go-active-learning/lib/util"
17 | )
18 |
// ModelType tags a MIRAClassifier with the kind of data it was built for.
type ModelType int

const (
	// EXAMPLE is the value callers in this repository pass when training on
	// examples.
	EXAMPLE ModelType = 0
	// TWITTER is presumably for models trained on tweets — TODO confirm; it
	// is not used in this file.
	TWITTER ModelType = 1
)
25 |
// MIRAClassifier is a linear classifier over string-named features, together
// with the evaluation figures recorded for it.
type MIRAClassifier struct {
	// ModelType is the tag supplied when the classifier was created.
	ModelType ModelType `json:"ModelType"`
	// Weight maps a feature name to its learned weight; PredictScore sums
	// the weights of the features present in a vector.
	Weight map[string]float64 `json:"Weight"`
	// C is the constant that enters the step size computed in learn.
	C float64 `json:"C"`
	// Accuracy, Precision, Recall and Fvalue are zero for a freshly created
	// classifier and are filled in by NewMIRAClassifierByCrossValidation.
	Accuracy float64 `json:"Accuracy"`
	Precision float64 `json:"Precision"`
	Recall float64 `json:"Recall"`
	Fvalue float64 `json:"Fvalue"`
}
35 |
// LearningInstance is anything the classifier can train on or evaluate: it
// exposes a feature vector and a label.
type LearningInstance interface {
	// GetFeatureVector returns the features of the instance.
	GetFeatureVector() feature.FeatureVector
	// GetLabel returns the label of the instance. filterLabeledInstances
	// treats a label equal to 0 as "not labeled".
	GetLabel() model.LabelType
}

// LearningInstances is a list of LearningInstance values.
type LearningInstances []LearningInstance
42 |
43 | var errNoTrainingInstances = errors.New("Empty training set")
44 | var errNoDevelopmentInstances = errors.New("Empty development set")
45 | var errNoMIRAModelLearned = errors.New("Fail to learn MIRA models")
46 | var errModelEvaluationFailure = errors.New("Failed to evaluate best MIRA")
47 | var errTrainingInstancesAllPositive = errors.New("Labels of training instances are all positive")
48 | var errTrainingInstancesAllNegative = errors.New("Labels of training instances are all negative")
49 | var errDevelopmentInstancesAllPositive = errors.New("Labels of development instances are all positive")
50 | var errDevelopmentInstancesAllNegative = errors.New("Labels of development instances are all negative")
51 |
52 | func newMIRAClassifier(modelType ModelType, c float64) *MIRAClassifier {
53 | return &MIRAClassifier{
54 | ModelType: modelType,
55 | Weight: make(map[string]float64),
56 | C: c,
57 | Accuracy: 0.0,
58 | Precision: 0.0,
59 | Recall: 0.0,
60 | Fvalue: 0.0,
61 | }
62 | }
63 |
64 | func filterLabeledInstances(instances LearningInstances) LearningInstances {
65 | var result LearningInstances
66 | for _, i := range instances {
67 | if i.GetLabel() != 0 {
68 | result = append(result, i)
69 | }
70 | }
71 | return result
72 | }
73 |
74 | func shuffle(instances LearningInstances) {
75 | n := len(instances)
76 | for i := n - 1; i >= 0; i-- {
77 | j := rand.Intn(i + 1)
78 | instances[i], instances[j] = instances[j], instances[i]
79 | }
80 | }
81 |
82 | func splitTrainAndDev(instances LearningInstances) (train LearningInstances, dev LearningInstances) {
83 | shuffle(instances)
84 | n := int(0.8 * float64(len(instances)))
85 | return instances[0:n], instances[n:]
86 | }
87 |
88 | func NewMIRAClassifier(modelType ModelType, instances LearningInstances, c float64) *MIRAClassifier {
89 | train := filterLabeledInstances(instances)
90 | model := newMIRAClassifier(modelType, c)
91 | for iter := 0; iter < 30; iter++ {
92 | shuffle(train)
93 | for _, example := range train {
94 | model.learn(example)
95 | }
96 | }
97 | return model
98 | }
99 |
100 | func overSamplingPositiveExamples(instances LearningInstances) LearningInstances {
101 | overSampled := LearningInstances{}
102 | posInstances := LearningInstances{}
103 | negInstances := LearningInstances{}
104 |
105 | numNeg := 0
106 |
107 | for _, i := range instances {
108 | if i.GetLabel() == model.NEGATIVE {
109 | numNeg += 1
110 | negInstances = append(negInstances, i)
111 | } else if i.GetLabel() == model.POSITIVE {
112 | posInstances = append(posInstances, i)
113 | }
114 | }
115 |
116 | for len(overSampled) <= numNeg {
117 | shuffle(posInstances)
118 | overSampled = append(overSampled, posInstances[0])
119 | }
120 | overSampled = append(overSampled, negInstances...)
121 | shuffle(overSampled)
122 |
123 | return overSampled
124 | }
125 |
126 | func extractGoldLabels(instances LearningInstances) []model.LabelType {
127 | golds := make([]model.LabelType, 0, 0)
128 | for _, i := range instances {
129 | golds = append(golds, i.GetLabel())
130 | }
131 | return golds
132 | }
133 |
134 | type MIRAClassifierList []MIRAClassifier
135 |
136 | func (l MIRAClassifierList) Len() int { return len(l) }
137 | func (l MIRAClassifierList) Less(i, j int) bool { return l[i].Fvalue < l[j].Fvalue }
138 | func (l MIRAClassifierList) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
139 |
140 | func allSameLabel(instances LearningInstances, label model.LabelType) bool {
141 | for _, instance := range instances {
142 | if instance.GetLabel() != label {
143 | return false
144 | }
145 | }
146 | return true
147 | }
148 |
149 | func isValidTrainAndDevelopmentInstances(train LearningInstances, dev LearningInstances) (bool, error) {
150 | if len(train) == 0 {
151 | return false, errNoTrainingInstances
152 | }
153 | if len(dev) == 0 {
154 | return false, errNoDevelopmentInstances
155 | }
156 |
157 | if allSameLabel(train, model.POSITIVE) {
158 | return false, errTrainingInstancesAllPositive
159 | }
160 | if allSameLabel(train, model.NEGATIVE) {
161 | return false, errTrainingInstancesAllNegative
162 | }
163 | if allSameLabel(dev, model.POSITIVE) {
164 | return false, errDevelopmentInstancesAllPositive
165 | }
166 | if allSameLabel(dev, model.NEGATIVE) {
167 | return false, errDevelopmentInstancesAllNegative
168 | }
169 |
170 | return true, nil
171 | }
172 |
173 | func NewMIRAClassifierByCrossValidation(modelType ModelType, instances LearningInstances) (*MIRAClassifier, error) {
174 | shuffle(instances)
175 | train, dev := splitTrainAndDev(filterLabeledInstances(instances))
176 | if valid, err := isValidTrainAndDevelopmentInstances(train, dev); !valid {
177 | return nil, err
178 | }
179 |
180 | train = overSamplingPositiveExamples(train)
181 |
182 | params := []float64{1000, 500, 100, 50, 10.0, 5.0, 1.0, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001}
183 | miraResults := MIRAClassifierList{}
184 |
185 | wg := &sync.WaitGroup{}
186 | cpus := runtime.NumCPU()
187 | runtime.GOMAXPROCS(cpus)
188 |
189 | models := make([]*MIRAClassifier, len(params))
190 | for idx, c := range params {
191 | wg.Add(1)
192 | go func(idx int, c float64) {
193 | defer wg.Done()
194 | model := NewMIRAClassifier(modelType, train, c)
195 | models[idx] = model
196 | }(idx, c)
197 | }
198 | wg.Wait()
199 |
200 | if len(models) == 0 {
201 | return nil, errNoMIRAModelLearned
202 | }
203 |
204 | maxFvalue := math.Inf(-1)
205 | for _, m := range models {
206 | devPredicts := make([]model.LabelType, len(dev))
207 | for i, instance := range dev {
208 | devPredicts[i] = m.Predict(instance.GetFeatureVector())
209 | }
210 | m.Accuracy = evaluation.GetAccuracy(extractGoldLabels(dev), devPredicts)
211 | m.Precision = evaluation.GetPrecision(extractGoldLabels(dev), devPredicts)
212 | m.Recall = evaluation.GetRecall(extractGoldLabels(dev), devPredicts)
213 | m.Fvalue = (2 * m.Recall * m.Precision) / (m.Recall + m.Precision)
214 | fmt.Fprintln(os.Stderr, fmt.Sprintf("C:%0.03f\tAccuracy:%0.03f\tPrecision:%0.03f\tRecall:%0.03f\tF-value:%0.03f", m.C, m.Accuracy, m.Precision, m.Recall, m.Fvalue))
215 | tp, fp, fn, tn := evaluation.GetConfusionMatrix(extractGoldLabels(dev), devPredicts)
216 | fmt.Fprintln(os.Stderr, fmt.Sprintf("tp:%d\tfp:%d\tfn:%d\ttn:%d", tp, fp, fn, tn))
217 | if math.IsNaN(m.Fvalue) {
218 | continue
219 | }
220 | miraResults = append(miraResults, *m)
221 | if m.Fvalue >= maxFvalue {
222 | maxFvalue = m.Fvalue
223 | }
224 | }
225 | if len(miraResults) == 0 {
226 | return nil, errModelEvaluationFailure
227 | }
228 |
229 | sort.Sort(sort.Reverse(miraResults))
230 | bestModel := &miraResults[0]
231 | instances = overSamplingPositiveExamples(instances)
232 | shuffle(instances)
233 | result := NewMIRAClassifier(modelType, filterLabeledInstances(instances), bestModel.C)
234 | result.Accuracy = bestModel.Accuracy
235 | result.Precision = bestModel.Precision
236 | result.Recall = bestModel.Recall
237 | result.Fvalue = bestModel.Fvalue
238 | return result, nil
239 | }
240 |
241 | func (m *MIRAClassifier) learn(instance LearningInstance) {
242 | tmp := float64(instance.GetLabel()) * m.PredictScore(instance.GetFeatureVector()) // y w^T x
243 | loss := 0.0
244 | if tmp < 1.0 {
245 | loss = 1 - tmp
246 | }
247 |
248 | norm := float64(len(instance.GetFeatureVector()) * len(instance.GetFeatureVector()))
249 | // tau := math.Min(m.C, loss/norm) // update by PA-I
250 | tau := loss / (norm + 1.0/m.C) // update by PA-II
251 |
252 | if tau != 0.0 {
253 | for _, f := range instance.GetFeatureVector() {
254 | w, _ := m.Weight[f]
255 | m.Weight[f] = w + tau*float64(instance.GetLabel())
256 | }
257 | }
258 | }
259 |
260 | func (m MIRAClassifier) PredictScore(features feature.FeatureVector) float64 {
261 | result := 0.0
262 | for _, f := range features {
263 | w, ok := m.Weight[f]
264 | if ok {
265 | result = result + w*1.0
266 | }
267 | }
268 | return result
269 | }
270 |
271 | func (m MIRAClassifier) Predict(features feature.FeatureVector) model.LabelType {
272 | if m.PredictScore(features) > 0 {
273 | return model.POSITIVE
274 | }
275 | return model.NEGATIVE
276 | }
277 |
278 | func (m MIRAClassifier) SortByScore(examples model.Examples) model.Examples {
279 | var unlabeledExamples model.Examples
280 | for _, e := range util.FilterUnlabeledExamples(examples) {
281 | e.Score = m.PredictScore(e.Fv)
282 | if !e.IsLabeled() && e.Score != 0.0 {
283 | unlabeledExamples = append(unlabeledExamples, e)
284 | }
285 | }
286 |
287 | sort.Sort(unlabeledExamples)
288 | return unlabeledExamples
289 | }
290 |
291 | func (m MIRAClassifier) GetWeight(f string) float64 {
292 | w, ok := m.Weight[f]
293 | if ok {
294 | return w
295 | }
296 | return 0.0
297 | }
298 |
299 | func (m MIRAClassifier) GetActiveFeatures() []string {
300 | result := make([]string, 0)
301 | for f := range m.Weight {
302 | result = append(result, f)
303 | }
304 | return result
305 | }
306 |
--------------------------------------------------------------------------------
/lib/classifier/mira_test.go:
--------------------------------------------------------------------------------
1 | package classifier
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/syou6162/go-active-learning/lib/example"
7 | "github.com/syou6162/go-active-learning/lib/model"
8 | )
9 |
10 | func TestPredictScore(t *testing.T) {
11 | e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
12 | e1.Title = "bookmark"
13 | e1.Fv = []string{"hoge", "fuga"}
14 | e2 := example.NewExample("http://google.com", model.NEGATIVE)
15 | e2.Title = "google"
16 | e2.Fv = []string{"piyo", "aaa"}
17 | e3 := example.NewExample("https://www.hatena.ne.jp", model.POSITIVE)
18 | e3.Title = "hatena"
19 | e3.Fv = []string{"hoge", "fuga"}
20 | e4 := example.NewExample("http://hogehoge.com", model.UNLABELED)
21 | e4.Title = "hogehoge"
22 | e4.Fv = []string{"piyo", "hoge"}
23 |
24 | examples := LearningInstances{e1, e2, e3, e4}
25 | c := NewMIRAClassifier(EXAMPLE, examples, 1.0)
26 |
27 | if c.PredictScore(e4.Fv) < 0.0 {
28 | t.Errorf("c.PredictScore(e4.Fv) == %f, want >= 0", c.PredictScore(e4.Fv))
29 | }
30 | }
31 |
32 | func TestSplitTrainAndDev(t *testing.T) {
33 | e1 := example.NewExample("http://a.hatena.ne.jp", model.POSITIVE)
34 | e2 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
35 | e3 := example.NewExample("http://google.com", model.UNLABELED)
36 | e4 := example.NewExample("https://a.hatena.ne.jp", model.POSITIVE)
37 | e5 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
38 | e6 := example.NewExample("https://a.hatena.ne.jp", model.POSITIVE)
39 | e7 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
40 | e8 := example.NewExample("http://google.com", model.UNLABELED)
41 | e9 := example.NewExample("https://a.hatena.ne.jp", model.POSITIVE)
42 | e10 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
43 |
44 | train, dev := splitTrainAndDev(LearningInstances{e1, e2, e3, e4, e5, e6, e7, e8, e9, e10})
45 | if len(train) != 8 {
46 | t.Error("Number of training examples should be 8")
47 | }
48 | if len(dev) != 2 {
49 | t.Error("Number of dev examples should be 2")
50 | }
51 | }
52 |
53 | func TestGetWeight(t *testing.T) {
54 | e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
55 | e1.Title = "bookmark"
56 | e1.Fv = []string{"hoge", "fuga"}
57 | e2 := example.NewExample("http://google.com", model.NEGATIVE)
58 | e2.Title = "google"
59 | e2.Fv = []string{"piyo", "aaa"}
60 | e3 := example.NewExample("https://www.hatena.ne.jp", model.POSITIVE)
61 | e3.Title = "hatena"
62 | e3.Fv = []string{"hoge", "fuga"}
63 | e4 := example.NewExample("http://hogehoge.com", model.UNLABELED)
64 | e4.Title = "hogehoge"
65 | e4.Fv = []string{"piyo", "hoge"}
66 |
67 | examples := LearningInstances{e1, e2, e3, e4}
68 | c := NewMIRAClassifier(EXAMPLE, examples, 1.0)
69 |
70 | if c.GetWeight("hoge") <= 0.0 {
71 | t.Errorf("c.GetWeight('hoge') == %f, want > 0", c.GetWeight("hoge"))
72 | }
73 | }
74 |
75 | func TestGetActiveFeatures(t *testing.T) {
76 | e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
77 | e1.Title = "bookmark"
78 | e1.Fv = []string{"hoge", "fuga"}
79 | e2 := example.NewExample("http://google.com", model.NEGATIVE)
80 | e2.Title = "google"
81 | e2.Fv = []string{"piyo", "aaa"}
82 | e3 := example.NewExample("https://www.hatena.ne.jp", model.POSITIVE)
83 | e3.Title = "hatena"
84 | e3.Fv = []string{"hoge", "fuga"}
85 | e4 := example.NewExample("http://hogehoge.com", model.UNLABELED)
86 | e4.Title = "hogehoge"
87 | e4.Fv = []string{"piyo", "hoge"}
88 |
89 | examples := LearningInstances{e1, e2, e3, e4}
90 | c := NewMIRAClassifier(EXAMPLE, examples, 1.0)
91 |
92 | if len(c.GetActiveFeatures()) <= 0 {
93 | t.Errorf("len(c.GetActiveFeatures()) <= %d, want > 0", len(c.GetActiveFeatures()))
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/lib/command/command.go:
--------------------------------------------------------------------------------
1 | package command
2 |
3 | import (
4 | "github.com/syou6162/go-active-learning/lib/add"
5 | "github.com/syou6162/go-active-learning/lib/annotation"
6 | "github.com/syou6162/go-active-learning/lib/diagnosis"
7 | "github.com/syou6162/go-active-learning/lib/related_example"
8 | "github.com/syou6162/go-active-learning/lib/top_accessed_example"
9 | "github.com/urfave/cli"
10 | )
11 |
// Commands lists the top-level commands contributed by the add,
// related_example, annotation, top_accessed_example and diagnosis packages.
// The tests in this repository assign it to cli.App.Commands.
var Commands = []cli.Command{
	add.CommandAdd,
	related_example.CommandAddRelatedExamples,
	annotation.CommandAnnotate,
	top_accessed_example.CommandAddTopAccessedExamples,
	diagnosis.CommandDiagnose,
}
19 |
--------------------------------------------------------------------------------
/lib/diagnosis/diagnosis.go:
--------------------------------------------------------------------------------
1 | package diagnosis
2 |
3 | import (
4 | featureweight "github.com/syou6162/go-active-learning/lib/diagnosis/feature_weight"
5 | labelconflict "github.com/syou6162/go-active-learning/lib/diagnosis/label_conflict"
6 | "github.com/urfave/cli"
7 | )
8 |
// CommandDiagnose is the "diagnose" command. It groups two subcommands:
// "label-conflict", handled by labelconflict.DoLabelConflict, and
// "feature-weight", handled by featureweight.DoListFeatureWeight. Both accept
// the "filter-status-code-ok" flag.
var CommandDiagnose = cli.Command{
	Name: "diagnose",
	Usage: "Diagnose training data or learned model",
	Description: `
Diagnose training data or learned model. This mode has two subcommand: label-conflict and feature-weight.
`,

	Subcommands: []cli.Command{
		{
			// Lists labeled examples, with those the model disagrees with first.
			Name: "label-conflict",
			Usage: "Diagnose label conflicts in training data",
			Description: `
Diagnose label conflicts in training data. 'conflict' means that an annotated label is '-1/1', but a predicted label by model is '1/-1'.
`,
			Action: labelconflict.DoLabelConflict,
			Flags: []cli.Flag{
				cli.BoolFlag{Name: "filter-status-code-ok", Usage: "Use only examples with status code = 200"},
			},
		},
		{
			// Prints every learned feature weight.
			Name: "feature-weight",
			Usage: "List feature weight",
			Description: `
List feature weight.
`,
			Action: featureweight.DoListFeatureWeight,
			Flags: []cli.Flag{
				cli.BoolFlag{Name: "filter-status-code-ok", Usage: "Use only examples with status code = 200"},
			},
		},
	},
}
41 |
--------------------------------------------------------------------------------
/lib/diagnosis/feature_weight/feature_weight.go:
--------------------------------------------------------------------------------
1 | package featureweight
2 |
3 | import (
4 | "fmt"
5 | "sort"
6 |
7 | "github.com/syou6162/go-active-learning/lib/classifier"
8 | "github.com/syou6162/go-active-learning/lib/service"
9 | "github.com/syou6162/go-active-learning/lib/util"
10 | "github.com/syou6162/go-active-learning/lib/util/converter"
11 | "github.com/urfave/cli"
12 | )
13 |
// Feature couples a feature name (Key) with its learned weight.
type Feature struct {
	Key string
	Weight float64
}
18 |
19 | type FeatureList []Feature
20 |
21 | func (p FeatureList) Len() int { return len(p) }
22 | func (p FeatureList) Less(i, j int) bool { return p[i].Weight < p[j].Weight }
23 | func (p FeatureList) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
24 |
25 | func DoListFeatureWeight(c *cli.Context) error {
26 | filterStatusCodeOk := c.Bool("filter-status-code-ok")
27 |
28 | app, err := service.NewDefaultApp()
29 | if err != nil {
30 | return err
31 | }
32 | defer app.Close()
33 |
34 | examples, err := app.SearchExamples()
35 | if err != nil {
36 | return err
37 | }
38 | app.Fetch(examples)
39 | for _, e := range examples {
40 | app.UpdateFeatureVector(e)
41 | }
42 | training := util.FilterLabeledExamples(examples)
43 |
44 | if filterStatusCodeOk {
45 | training = util.FilterStatusCodeOkExamples(training)
46 | }
47 |
48 | model, err := classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(training))
49 | if err != nil {
50 | return err
51 | }
52 |
53 | tmp := make(FeatureList, 0)
54 | for _, k := range model.GetActiveFeatures() {
55 | tmp = append(tmp, Feature{k, model.GetWeight(k)})
56 | }
57 | sort.Sort(sort.Reverse(tmp))
58 |
59 | for _, p := range tmp {
60 | fmt.Println(fmt.Sprintf("%+0.2f\t%s", p.Weight, p.Key))
61 | }
62 |
63 | return nil
64 | }
65 |
--------------------------------------------------------------------------------
/lib/diagnosis/feature_weight/feature_weight_test.go:
--------------------------------------------------------------------------------
1 | package featureweight_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/syou6162/go-active-learning/lib/command"
7 | "github.com/syou6162/go-active-learning/lib/service"
8 | "github.com/syou6162/go-active-learning/lib/util/file"
9 | "github.com/urfave/cli"
10 | )
11 |
12 | func TestDoListFeatureWeight(t *testing.T) {
13 | inputFilename := "../../../tech_input_example.txt"
14 | train, err := file.ReadExamples(inputFilename)
15 | if err != nil {
16 | t.Error(err)
17 | }
18 |
19 | a, err := service.NewDefaultApp()
20 | if err != nil {
21 | t.Error(err)
22 | }
23 | defer a.Close()
24 |
25 | if err = a.DeleteAllExamples(); err != nil {
26 | t.Error(err)
27 | }
28 |
29 | for _, example := range train {
30 | if err = a.UpdateOrCreateExample(example); err != nil {
31 | t.Error(err)
32 | }
33 | }
34 |
35 | app := cli.NewApp()
36 | app.Commands = command.Commands
37 | args := []string{
38 | "go-active-learning",
39 | "diagnose",
40 | "feature-weight",
41 | "--filter-status-code-ok",
42 | }
43 |
44 | if err := app.Run(args); err != nil {
45 | t.Error(err)
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/lib/diagnosis/label_conflict/label_conflict.go:
--------------------------------------------------------------------------------
1 | package labelconflict
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "sort"
7 | "strconv"
8 |
9 | "encoding/csv"
10 |
11 | "github.com/syou6162/go-active-learning/lib/classifier"
12 | "github.com/syou6162/go-active-learning/lib/model"
13 | "github.com/syou6162/go-active-learning/lib/service"
14 | "github.com/syou6162/go-active-learning/lib/util"
15 | "github.com/syou6162/go-active-learning/lib/util/converter"
16 | "github.com/urfave/cli"
17 | )
18 |
19 | func DoLabelConflict(c *cli.Context) error {
20 | filterStatusCodeOk := c.Bool("filter-status-code-ok")
21 |
22 | app, err := service.NewDefaultApp()
23 | if err != nil {
24 | return err
25 | }
26 | defer app.Close()
27 |
28 | examples, err := app.SearchExamples()
29 | if err != nil {
30 | return err
31 | }
32 | app.Fetch(examples)
33 | for _, e := range examples {
34 | app.UpdateFeatureVector(e)
35 | }
36 | training := util.FilterLabeledExamples(examples)
37 |
38 | if filterStatusCodeOk {
39 | training = util.FilterStatusCodeOkExamples(training)
40 | }
41 |
42 | m, err := classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(training))
43 | if err != nil {
44 | return err
45 | }
46 |
47 | wrongExamples := model.Examples{}
48 | correctExamples := model.Examples{}
49 |
50 | for _, e := range training {
51 | e.Score = m.PredictScore(e.Fv)
52 | if float64(e.Label)*e.Score < 0 {
53 | wrongExamples = append(wrongExamples, e)
54 | } else {
55 | correctExamples = append(correctExamples, e)
56 | }
57 | }
58 |
59 | sort.Sort(sort.Reverse(wrongExamples))
60 | sort.Sort(correctExamples)
61 | printResult(*m, correctExamples, wrongExamples)
62 |
63 | return nil
64 | }
65 |
66 | func printResult(m classifier.MIRAClassifier, correctExamples model.Examples, wrongExamples model.Examples) error {
67 | fmt.Println("Index\tLabel\tScore\tURL\tTitle")
68 | result := append(wrongExamples, correctExamples...)
69 |
70 | w := csv.NewWriter(os.Stdout)
71 | w.Comma = '\t'
72 |
73 | for idx, e := range result {
74 | record := []string{
75 | strconv.Itoa(idx),
76 | strconv.Itoa(int(e.Label)),
77 | fmt.Sprintf("%0.03f", m.PredictScore(e.Fv)),
78 | e.Url,
79 | e.Title,
80 | }
81 | if err := w.Write(record); err != nil {
82 | return err
83 | }
84 | }
85 |
86 | w.Flush()
87 | if err := w.Error(); err != nil {
88 | return err
89 | }
90 |
91 | return nil
92 | }
93 |
--------------------------------------------------------------------------------
/lib/diagnosis/label_conflict/label_conflict_test.go:
--------------------------------------------------------------------------------
1 | package labelconflict_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/syou6162/go-active-learning/lib/command"
7 | "github.com/syou6162/go-active-learning/lib/service"
8 | "github.com/syou6162/go-active-learning/lib/util/file"
9 | "github.com/urfave/cli"
10 | )
11 |
12 | func TestDoLabelConflict(t *testing.T) {
13 | inputFilename := "../../../tech_input_example.txt"
14 | train, err := file.ReadExamples(inputFilename)
15 | if err != nil {
16 | t.Error(err)
17 | }
18 |
19 | a, err := service.NewDefaultApp()
20 | if err != nil {
21 | t.Error(err)
22 | }
23 | defer a.Close()
24 |
25 | if err = a.DeleteAllExamples(); err != nil {
26 | t.Error(err)
27 | }
28 |
29 | for _, example := range train {
30 | if err = a.UpdateOrCreateExample(example); err != nil {
31 | t.Error(err)
32 | }
33 | }
34 |
35 | app := cli.NewApp()
36 | app.Commands = command.Commands
37 | args := []string{
38 | "go-active-learning",
39 | "diagnose",
40 | "label-conflict",
41 | }
42 |
43 | if err := app.Run(args); err != nil {
44 | t.Error(err)
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/lib/evaluation/evaluation.go:
--------------------------------------------------------------------------------
1 | package evaluation
2 |
3 | import (
4 | "github.com/syou6162/go-active-learning/lib/model"
5 | )
6 |
7 | func GetAccuracy(gold []model.LabelType, predict []model.LabelType) float64 {
8 | if len(gold) != len(predict) {
9 | return 0.0
10 | }
11 | sum := 0.0
12 | for i, v := range gold {
13 | if v == predict[i] {
14 | sum += 1.0
15 | }
16 | }
17 | return sum / float64(len(gold))
18 | }
19 |
20 | func GetPrecision(gold []model.LabelType, predict []model.LabelType) float64 {
21 | tp := 0.0
22 | fp := 0.0
23 | for i, v := range gold {
24 | if v == model.POSITIVE && predict[i] == model.POSITIVE {
25 | tp += 1.0
26 | }
27 | if v == model.NEGATIVE && predict[i] == model.POSITIVE {
28 | fp += 1.0
29 | }
30 | }
31 | return tp / (tp + fp)
32 | }
33 |
34 | func GetRecall(gold []model.LabelType, predict []model.LabelType) float64 {
35 | tp := 0.0
36 | fn := 0.0
37 | for i, v := range gold {
38 | if v == model.POSITIVE && predict[i] == model.POSITIVE {
39 | tp += 1.0
40 | }
41 | if v == model.POSITIVE && predict[i] == model.NEGATIVE {
42 | fn += 1.0
43 | }
44 | }
45 | return tp / (tp + fn)
46 | }
47 |
48 | func GetConfusionMatrix(gold []model.LabelType, predict []model.LabelType) (int, int, int, int) {
49 | tp := 0
50 | fp := 0
51 | fn := 0
52 | tn := 0
53 | for i, v := range gold {
54 | if v == model.POSITIVE && predict[i] == model.POSITIVE {
55 | tp += 1
56 | }
57 | if v == model.NEGATIVE && predict[i] == model.POSITIVE {
58 | fp += 1
59 | }
60 | if v == model.POSITIVE && predict[i] == model.NEGATIVE {
61 | fn += 1
62 | }
63 | if v == model.NEGATIVE && predict[i] == model.NEGATIVE {
64 | tn += 1
65 | }
66 | }
67 | return tp, fp, fn, tn
68 | }
69 |
--------------------------------------------------------------------------------
/lib/evaluation/evaluation_test.go:
--------------------------------------------------------------------------------
1 | package evaluation
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 |
7 | "github.com/syou6162/go-active-learning/lib/model"
8 | )
9 |
10 | func TestGetAccuracy(t *testing.T) {
11 | gold := []model.LabelType{model.POSITIVE, model.POSITIVE, model.NEGATIVE, model.NEGATIVE}
12 | predict := []model.LabelType{model.POSITIVE, model.POSITIVE, model.NEGATIVE, model.POSITIVE}
13 | accuracy := 0.75
14 |
15 | if GetAccuracy(gold, predict) != accuracy {
16 | t.Error(fmt.Printf("Accuracy should be %f", accuracy))
17 | }
18 | }
19 |
20 | func TestGetPrecision(t *testing.T) {
21 | gold := []model.LabelType{model.POSITIVE, model.POSITIVE, model.NEGATIVE, model.NEGATIVE}
22 | predict := []model.LabelType{model.POSITIVE, model.NEGATIVE, model.NEGATIVE, model.POSITIVE}
23 | precision := 0.5
24 |
25 | if GetPrecision(gold, predict) != precision {
26 | t.Error(fmt.Printf("Precision should be %f", precision))
27 | }
28 | }
29 |
30 | func TestGetRecall(t *testing.T) {
31 | gold := []model.LabelType{model.POSITIVE, model.POSITIVE, model.NEGATIVE, model.NEGATIVE}
32 | predict := []model.LabelType{model.POSITIVE, model.NEGATIVE, model.NEGATIVE, model.POSITIVE}
33 | recall := 0.5
34 |
35 | if GetRecall(gold, predict) != recall {
36 | t.Error(fmt.Printf("Recall should be %f", recall))
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/lib/example/example.go:
--------------------------------------------------------------------------------
1 | package example
2 |
3 | import (
4 | "time"
5 |
6 | "github.com/syou6162/go-active-learning/lib/feature"
7 | example_feature "github.com/syou6162/go-active-learning/lib/feature/example"
8 | "github.com/syou6162/go-active-learning/lib/model"
9 | )
10 |
11 | func NewExample(url string, label model.LabelType) *model.Example {
12 | IsNew := false
13 | if label == model.UNLABELED {
14 | IsNew = true
15 | }
16 | now := time.Now()
17 | return &model.Example{
18 | Label: label,
19 | Fv: feature.FeatureVector{},
20 | Url: url,
21 | FinalUrl: url,
22 | Title: "",
23 | Description: "",
24 | OgDescription: "",
25 | OgType: "",
26 | OgImage: "",
27 | Body: "",
28 | Score: 0.0,
29 | IsNew: IsNew,
30 | StatusCode: 0,
31 | Favicon: "",
32 | ErrorCount: 0,
33 | CreatedAt: now,
34 | UpdatedAt: now,
35 | ReferringTweets: &model.ReferringTweets{},
36 | HatenaBookmark: &model.HatenaBookmark{Bookmarks: make([]*model.Bookmark, 0)},
37 | }
38 | }
39 |
40 | func GetStat(examples model.Examples) map[string]int {
41 | stat := make(map[string]int)
42 | for _, e := range examples {
43 | switch e.Label {
44 | case model.POSITIVE:
45 | stat["positive"]++
46 | case model.NEGATIVE:
47 | stat["negative"]++
48 | case model.UNLABELED:
49 | stat["unlabeled"]++
50 | }
51 | }
52 | return stat
53 | }
54 |
55 | func ExtractFeatures(e model.Example) feature.FeatureVector {
56 | var fv feature.FeatureVector
57 | fv = append(fv, "BIAS")
58 | fv = append(fv, example_feature.ExtractHostFeature(e.FinalUrl))
59 | fv = append(fv, example_feature.ExtractJpnNounFeatures(example_feature.ExtractPath(e.FinalUrl), "URL")...)
60 | fv = append(fv, example_feature.ExtractNounFeatures(e.Title, "TITLE")...)
61 | fv = append(fv, example_feature.ExtractNounFeatures(e.Description, "DESCRIPTION")...)
62 | fv = append(fv, example_feature.ExtractNounFeatures(e.Body, "BODY")...)
63 | return fv
64 | }
65 |
--------------------------------------------------------------------------------
/lib/feature/example/example.go:
--------------------------------------------------------------------------------
1 | package example_feature
2 |
3 | import (
4 | "net/url"
5 | "strings"
6 | "sync"
7 | "unicode"
8 |
9 | "github.com/ikawaha/kagome/tokenizer"
10 | "github.com/jdkato/prose/tag"
11 | "github.com/jdkato/prose/tokenize"
12 | "github.com/syou6162/go-active-learning/lib/feature"
13 | )
14 |
// excludingWordList enumerates punctuation and symbol tokens that must never
// become features. IsExcludingWord copies it into excludingWordMap on first
// use.
// NOTE(review): `/` is listed twice below; the duplicate is harmless for the
// lookup map, but one of them may originally have been a different
// (e.g. full-width) character — confirm against the upstream source.
var excludingWordList = []string{
	`:`, `;`,
	`,`, `.`,
	`"`, `''`,
	`+`, `-`, `*`, `/`, `|`, `++`, `--`,
	`[`, `]`,
	`{`, `}`,
	`(`, `)`,
	`<`, `>`,
	`「`, `」`,
	`/`,
	`@`, `#`, `~`, `%`, `$`, `^`,
}
28 |
// Lazily initialised package-level singletons. Each value is paired with the
// sync.Once that guards its one-time construction in the matching getter.
var (
	japaneseTokenizer     *tokenizer.Tokenizer
	japaneseTokenizerOnce sync.Once
	englishTokenizer      *tokenize.TreebankWordTokenizer
	englishTokenizerOnce  sync.Once
	englishTagger         *tag.PerceptronTagger
	englishTaggerOnce     sync.Once
	excludingWordMapOnce  sync.Once
)

// excludingWordMap is the set form of excludingWordList; it is filled on the
// first call to IsExcludingWord.
var excludingWordMap = make(map[string]bool)
40 |
41 | func GetJapaneseTokenizer() *tokenizer.Tokenizer {
42 | japaneseTokenizerOnce.Do(func() {
43 | t := tokenizer.New()
44 | japaneseTokenizer = &t
45 | })
46 |
47 | return japaneseTokenizer
48 | }
49 |
50 | func GetEnglishTokenizer() *tokenize.TreebankWordTokenizer {
51 | englishTokenizerOnce.Do(func() {
52 | englishTokenizer = tokenize.NewTreebankWordTokenizer()
53 | })
54 | return englishTokenizer
55 | }
56 |
57 | func GetEnglishTagger() *tag.PerceptronTagger {
58 | englishTaggerOnce.Do(func() {
59 | englishTagger = tag.NewPerceptronTagger()
60 | })
61 | return englishTagger
62 | }
63 |
// isJapanese reports whether str contains at least one Hiragana, Katakana or
// Han rune, or one of the punctuation marks "。" / "、".
func isJapanese(str string) bool {
	firstJapaneseRune := strings.IndexFunc(str, func(r rune) bool {
		return unicode.In(r, unicode.Hiragana, unicode.Katakana, unicode.Han)
	})
	return firstJapaneseRune >= 0 || strings.ContainsAny(str, "。、")
}
77 |
78 | func IsExcludingWord(w string) bool {
79 | excludingWordMapOnce.Do(func() {
80 | for _, w := range excludingWordList {
81 | excludingWordMap[w] = true
82 | }
83 | })
84 | if _, ok := excludingWordMap[w]; ok {
85 | return true
86 | }
87 | return false
88 | }
89 |
90 | func extractEngNounFeaturesWithoutPrefix(s string) feature.FeatureVector {
91 | var fv feature.FeatureVector
92 | if s == "" {
93 | return fv
94 | }
95 |
96 | words := GetEnglishTokenizer().Tokenize(s)
97 | tagger := GetEnglishTagger()
98 | for _, tok := range tagger.Tag(words) {
99 | if IsExcludingWord(tok.Text) {
100 | continue
101 | }
102 | switch tok.Tag {
103 | // https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
104 | case "NN", "NNS", "NNP", "NNPS", "PRP", "PRP$":
105 | fv = append(fv, strings.ToLower(tok.Text))
106 | }
107 | }
108 |
109 | return fv
110 | }
111 |
112 | func extractEngNounFeatures(s string, prefix string) feature.FeatureVector {
113 | var fv feature.FeatureVector
114 | for _, surface := range extractEngNounFeaturesWithoutPrefix(s) {
115 | fv = append(fv, prefix+":"+surface)
116 | }
117 | return fv
118 | }
119 |
120 | func ExtractJpnNounFeaturesWithoutPrefix(s string) feature.FeatureVector {
121 | var fv feature.FeatureVector
122 | if s == "" {
123 | return fv
124 | }
125 | t := GetJapaneseTokenizer()
126 | tokens := t.Tokenize(strings.ToLower(s))
127 | for _, token := range tokens {
128 | if token.Pos() == "名詞" {
129 | surface := token.Surface
130 | if len(token.Features()) >= 2 && token.Features()[1] == "数" {
131 | surface = "NUM"
132 | }
133 | if IsExcludingWord(surface) {
134 | continue
135 | }
136 | fv = append(fv, surface)
137 | }
138 | }
139 | return fv
140 | }
141 |
142 | func ExtractJpnNounFeatures(s string, prefix string) feature.FeatureVector {
143 | var fv feature.FeatureVector
144 | for _, surface := range ExtractJpnNounFeaturesWithoutPrefix(s) {
145 | fv = append(fv, prefix+":"+surface)
146 | }
147 | return fv
148 | }
149 |
150 | func ExtractNounFeatures(s string, prefix string) feature.FeatureVector {
151 | if isJapanese(s) {
152 | return ExtractJpnNounFeatures(s, prefix)
153 | } else {
154 | return extractEngNounFeatures(s, prefix)
155 | }
156 | }
157 |
158 | func ExtractNounFeaturesWithoutPrefix(s string) feature.FeatureVector {
159 | if isJapanese(s) {
160 | return ExtractJpnNounFeaturesWithoutPrefix(s)
161 | } else {
162 | return extractEngNounFeaturesWithoutPrefix(s)
163 | }
164 | }
165 |
// ExtractHostFeature returns "HOST:<host>" for the host part of urlString,
// or "HOST:INVALID_HOST" when urlString cannot be parsed as a URL.
func ExtractHostFeature(urlString string) string {
	const prefix = "HOST"
	host := "INVALID_HOST"
	if parsed, err := url.Parse(urlString); err == nil {
		host = parsed.Host
	}
	return prefix + ":" + host
}
174 |
// ExtractPath returns the path component of urlString, or the empty string
// when urlString cannot be parsed as a URL.
func ExtractPath(urlString string) string {
	if parsed, err := url.Parse(urlString); err == nil {
		return parsed.Path
	}
	return ""
}
183 |
--------------------------------------------------------------------------------
/lib/feature/example/example_test.go:
--------------------------------------------------------------------------------
1 | package example_feature
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 | )
7 |
8 | func TestIsJapanese(t *testing.T) {
9 | text := "ほげ"
10 | if !isJapanese(text) {
11 | t.Error(fmt.Printf("%s should be Japanese", text))
12 | }
13 | text = "文献紹介 / Youtube"
14 | if !isJapanese(text) {
15 | t.Error(fmt.Printf("%s should be Japanese", text))
16 | }
17 | text = "This is a pen."
18 | if isJapanese(text) {
19 | t.Error(fmt.Printf("%s should be not Japanese", text))
20 | }
21 | }
22 |
23 | func TestJapaneseNounFeatures(t *testing.T) {
24 | text := "日本語のテストです"
25 | fv := ExtractJpnNounFeaturesWithoutPrefix(text)
26 | if len(fv) != 2 {
27 | t.Error(fmt.Printf("Size of feature vector for %s should be 2, but %d", text, len(fv)))
28 | }
29 | text = "文献紹介 / Youtube"
30 | fv = ExtractJpnNounFeaturesWithoutPrefix(text)
31 | if len(fv) != 3 {
32 | t.Error(fmt.Printf("Size of feature vector for %s should be 3, but %d", text, len(fv)))
33 | }
34 | }
35 |
36 | func TestEngNounFeatures(t *testing.T) {
37 | text := "Hello World!"
38 | fv := extractEngNounFeatures(text, "")
39 | if len(fv) != 2 {
40 | t.Error(fmt.Printf("Size of feature vector for %s should be 2", text))
41 | }
42 | }
43 |
44 | func TestExtractPath(t *testing.T) {
45 | url := "https://b.hatena.ne.jp/search/text?safe=on&q=nlp&users=50"
46 | path := "/search/text"
47 | if ExtractPath(url) != path {
48 | t.Error(fmt.Printf("path should be %s", path))
49 | }
50 | }
51 |
52 | func TestExtractHostFeature(t *testing.T) {
53 | url := "https://b.hatena.ne.jp/search/text?safe=on&q=nlp&users=50"
54 | hostFeature := "HOST:b.hatena.ne.jp"
55 | if ExtractHostFeature(url) != hostFeature {
56 | t.Error(fmt.Printf("Host feature should be %s", hostFeature))
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/lib/feature/feature.go:
--------------------------------------------------------------------------------
1 | package feature
2 |
3 | import (
4 | "encoding/json"
5 | )
6 |
// FeatureVector is an ordered list of feature names.
type FeatureVector []string

// MarshalBinary encodes the vector as a JSON array of strings.
func (fv *FeatureVector) MarshalBinary() ([]byte, error) {
	encoded, err := json.Marshal(fv)
	if err != nil {
		return nil, err
	}
	return encoded, nil
}

// UnmarshalBinary decodes a JSON array of strings into the vector and
// returns any decoding error unchanged.
func (fv *FeatureVector) UnmarshalBinary(data []byte) error {
	return json.Unmarshal(data, fv)
}
24 |
--------------------------------------------------------------------------------
/lib/feature/tweet/tweet.go:
--------------------------------------------------------------------------------
1 | package tweet_feature
2 |
3 | import (
4 | "fmt"
5 | "regexp"
6 |
7 | "github.com/syou6162/go-active-learning/lib/feature"
8 | "github.com/syou6162/go-active-learning/lib/model"
9 | "gopkg.in/vmarkovtsev/go-lcss.v1"
10 | )
11 |
// ExampleAndTweet pairs an example with one tweet and caches the quantities
// derived from the pair by GetExampleAndTweet, from which the tweet features
// are computed.
type ExampleAndTweet struct {
	example *model.Example
	tweet   *model.Tweet
	// lcsLen is GetLCSLen(example.Title, tweet.FullText).
	lcsLen int
	// atMarksCnt is the number of `@...` matches in tweet.FullText.
	atMarksCnt int
	// hashTagsCnt is the number of `#...` matches in tweet.FullText.
	hashTagsCnt int
	// cleanedText is tweet.FullText with the `@...` and `#...` matches removed.
	cleanedText string
	// cleanedLcsLen is GetLCSLen(example.Title, cleanedText).
	cleanedLcsLen int
}

// GetLabel returns the label of the tweet (not of the example).
func (et *ExampleAndTweet) GetLabel() model.LabelType {
	return et.tweet.Label
}
25 |
26 | func GetExampleAndTweet(e *model.Example, t *model.Tweet) ExampleAndTweet {
27 | result := ExampleAndTweet{example: e, tweet: t}
28 | result.lcsLen = GetLCSLen(e.Title, t.FullText)
29 |
30 | atRegexp := regexp.MustCompile(`@[^ ]+`)
31 | result.atMarksCnt = len(atRegexp.FindAllStringSubmatch(t.FullText, -1))
32 | str := atRegexp.ReplaceAllString(t.FullText, "")
33 | hashRegexp := regexp.MustCompile(`#[^ ]+`)
34 | result.hashTagsCnt = len(hashRegexp.FindAllStringSubmatch(t.FullText, -1))
35 | result.cleanedText = hashRegexp.ReplaceAllString(str, "")
36 | result.cleanedLcsLen = GetLCSLen(e.Title, result.cleanedText)
37 | return result
38 | }
39 |
40 | func GetLCSLen(str1 string, str2 string) int {
41 | return len(string(lcss.LongestCommonSubstring([]byte(str1), []byte(str2))))
42 | }
43 |
44 | func LCSLenFeature(et ExampleAndTweet) string {
45 | prefix := "LCSLenFeature"
46 | len := et.lcsLen
47 | switch {
48 | case len == 0:
49 | return fmt.Sprintf("%s:0", prefix)
50 | case len < 5:
51 | return fmt.Sprintf("%s:5", prefix)
52 | case len < 10:
53 | return fmt.Sprintf("%s:10", prefix)
54 | case len < 25:
55 | return fmt.Sprintf("%s:25", prefix)
56 | case len < 50:
57 | return fmt.Sprintf("%s:50", prefix)
58 | case len < 100:
59 | return fmt.Sprintf("%s:100", prefix)
60 | default:
61 | return fmt.Sprintf("%s:INF", prefix)
62 | }
63 | }
64 |
65 | func CleanedLCSLenFeature(et ExampleAndTweet) string {
66 | prefix := "CleanedLCSLenFeature"
67 | len := et.cleanedLcsLen
68 | switch {
69 | case len == 0:
70 | return fmt.Sprintf("%s:0", prefix)
71 | case len < 5:
72 | return fmt.Sprintf("%s:5", prefix)
73 | case len < 10:
74 | return fmt.Sprintf("%s:10", prefix)
75 | case len < 25:
76 | return fmt.Sprintf("%s:25", prefix)
77 | case len < 50:
78 | return fmt.Sprintf("%s:50", prefix)
79 | case len < 100:
80 | return fmt.Sprintf("%s:100", prefix)
81 | default:
82 | return fmt.Sprintf("%s:INF", prefix)
83 | }
84 | }
85 |
86 | func LCSRatioFeature(et ExampleAndTweet) string {
87 | prefix := "LCSRatioFeature"
88 | ratio := float64(et.lcsLen) / float64(len(et.tweet.FullText))
89 | switch {
90 | case ratio == 0.0:
91 | return fmt.Sprintf("%s:0.0", prefix)
92 | case ratio < 0.1:
93 | return fmt.Sprintf("%s:0.1", prefix)
94 | case ratio < 0.25:
95 | return fmt.Sprintf("%s:0.25", prefix)
96 | case ratio < 0.5:
97 | return fmt.Sprintf("%s:0.5", prefix)
98 | case ratio < 0.75:
99 | return fmt.Sprintf("%s:0.75", prefix)
100 | case ratio < 0.9:
101 | return fmt.Sprintf("%s:0.0", prefix)
102 | default:
103 | return fmt.Sprintf("%s:1.0", prefix)
104 | }
105 | }
106 |
107 | func CleanedLCSRatioFeature(et ExampleAndTweet) string {
108 | prefix := "CleanedLCSRatioFeature"
109 | ratio := float64(et.cleanedLcsLen) / float64(len(et.tweet.FullText))
110 | switch {
111 | case ratio == 0.0:
112 | return fmt.Sprintf("%s:0.0", prefix)
113 | case ratio < 0.1:
114 | return fmt.Sprintf("%s:0.1", prefix)
115 | case ratio < 0.25:
116 | return fmt.Sprintf("%s:0.25", prefix)
117 | case ratio < 0.5:
118 | return fmt.Sprintf("%s:0.5", prefix)
119 | case ratio < 0.75:
120 | return fmt.Sprintf("%s:0.75", prefix)
121 | case ratio < 0.9:
122 | return fmt.Sprintf("%s:0.0", prefix)
123 | default:
124 | return fmt.Sprintf("%s:1.0", prefix)
125 | }
126 | }
127 |
128 | func FavoriteCountFeature(et ExampleAndTweet) string {
129 | prefix := "FavoriteCountFeature"
130 | cnt := et.tweet.FavoriteCount
131 | switch {
132 | case cnt == 0:
133 | return fmt.Sprintf("%s:0", prefix)
134 | case cnt == 1:
135 | return fmt.Sprintf("%s:1", prefix)
136 | case cnt <= 3:
137 | return fmt.Sprintf("%s:3", prefix)
138 | case cnt <= 5:
139 | return fmt.Sprintf("%s:5", prefix)
140 | case cnt <= 10:
141 | return fmt.Sprintf("%s:10", prefix)
142 | case cnt <= 25:
143 | return fmt.Sprintf("%s:25", prefix)
144 | case cnt <= 50:
145 | return fmt.Sprintf("%s:50", prefix)
146 | case cnt <= 100:
147 | return fmt.Sprintf("%s:100", prefix)
148 | default:
149 | return fmt.Sprintf("%s:INF", prefix)
150 | }
151 | }
152 |
153 | func RetweetCountFeature(et ExampleAndTweet) string {
154 | prefix := "RetweetCountFeature"
155 | cnt := et.tweet.RetweetCount
156 | switch {
157 | case cnt == 0:
158 | return fmt.Sprintf("%s:0", prefix)
159 | case cnt == 1:
160 | return fmt.Sprintf("%s:1", prefix)
161 | case cnt <= 3:
162 | return fmt.Sprintf("%s:3", prefix)
163 | case cnt <= 5:
164 | return fmt.Sprintf("%s:5", prefix)
165 | case cnt <= 10:
166 | return fmt.Sprintf("%s:10", prefix)
167 | case cnt <= 25:
168 | return fmt.Sprintf("%s:25", prefix)
169 | case cnt <= 50:
170 | return fmt.Sprintf("%s:50", prefix)
171 | case cnt <= 100:
172 | return fmt.Sprintf("%s:100", prefix)
173 | default:
174 | return fmt.Sprintf("%s:INF", prefix)
175 | }
176 | }
177 |
178 | func AtMarksCountFeature(et ExampleAndTweet) string {
179 | prefix := "AtMarksCountFeature"
180 | cnt := et.atMarksCnt
181 | switch {
182 | case cnt == 0:
183 | return fmt.Sprintf("%s:0", prefix)
184 | case cnt == 1:
185 | return fmt.Sprintf("%s:1", prefix)
186 | case cnt <= 3:
187 | return fmt.Sprintf("%s:3", prefix)
188 | case cnt <= 5:
189 | return fmt.Sprintf("%s:5", prefix)
190 | case cnt <= 10:
191 | return fmt.Sprintf("%s:10", prefix)
192 | default:
193 | return fmt.Sprintf("%s:INF", prefix)
194 | }
195 | }
196 |
197 | func HashTagsCountFeature(et ExampleAndTweet) string {
198 | prefix := "HashTagsCountFeature"
199 | cnt := et.atMarksCnt
200 | switch {
201 | case cnt == 0:
202 | return fmt.Sprintf("%s:0", prefix)
203 | case cnt == 1:
204 | return fmt.Sprintf("%s:1", prefix)
205 | case cnt <= 3:
206 | return fmt.Sprintf("%s:3", prefix)
207 | case cnt <= 5:
208 | return fmt.Sprintf("%s:5", prefix)
209 | case cnt <= 10:
210 | return fmt.Sprintf("%s:10", prefix)
211 | default:
212 | return fmt.Sprintf("%s:INF", prefix)
213 | }
214 | }
215 |
216 | func TextLengthFeature(et ExampleAndTweet) string {
217 | prefix := "TextLengthFeature"
218 | cnt := len(et.tweet.FullText)
219 | switch {
220 | case cnt == 0:
221 | return fmt.Sprintf("%s:0", prefix)
222 | case cnt == 1:
223 | return fmt.Sprintf("%s:1", prefix)
224 | case cnt == 3:
225 | return fmt.Sprintf("%s:3", prefix)
226 | case cnt < 5:
227 | return fmt.Sprintf("%s:5", prefix)
228 | case cnt < 10:
229 | return fmt.Sprintf("%s:10", prefix)
230 | case cnt < 25:
231 | return fmt.Sprintf("%s:25", prefix)
232 | case cnt < 50:
233 | return fmt.Sprintf("%s:50", prefix)
234 | case cnt < 100:
235 | return fmt.Sprintf("%s:100", prefix)
236 | default:
237 | return fmt.Sprintf("%s:INF", prefix)
238 | }
239 | }
240 |
241 | func CleanedTextLengthFeature(et ExampleAndTweet) string {
242 | prefix := "CleanedTextLengthFeature"
243 | cnt := len(et.cleanedText)
244 | switch {
245 | case cnt == 0:
246 | return fmt.Sprintf("%s:0", prefix)
247 | case cnt == 1:
248 | return fmt.Sprintf("%s:1", prefix)
249 | case cnt == 3:
250 | return fmt.Sprintf("%s:3", prefix)
251 | case cnt < 5:
252 | return fmt.Sprintf("%s:5", prefix)
253 | case cnt < 10:
254 | return fmt.Sprintf("%s:10", prefix)
255 | case cnt < 25:
256 | return fmt.Sprintf("%s:25", prefix)
257 | case cnt < 50:
258 | return fmt.Sprintf("%s:50", prefix)
259 | case cnt < 100:
260 | return fmt.Sprintf("%s:100", prefix)
261 | default:
262 | return fmt.Sprintf("%s:INF", prefix)
263 | }
264 | }
265 |
266 | func ScreenNameFeature(et ExampleAndTweet) string {
267 | prefix := "ScreenNameFeature"
268 | return fmt.Sprintf("%s:%s", prefix, et.tweet.ScreenName)
269 | }
270 |
271 | func (et *ExampleAndTweet) GetFeatureVector() feature.FeatureVector {
272 | var fv feature.FeatureVector
273 |
274 | fv = append(fv, "BIAS")
275 | fv = append(fv, LCSLenFeature(*et))
276 | fv = append(fv, CleanedLCSLenFeature(*et))
277 | fv = append(fv, LCSRatioFeature(*et))
278 | fv = append(fv, CleanedLCSRatioFeature(*et))
279 | fv = append(fv, TextLengthFeature(*et))
280 | fv = append(fv, CleanedTextLengthFeature(*et))
281 |
282 | fv = append(fv, ScreenNameFeature(*et))
283 | fv = append(fv, FavoriteCountFeature(*et))
284 | fv = append(fv, RetweetCountFeature(*et))
285 | fv = append(fv, AtMarksCountFeature(*et))
286 | fv = append(fv, HashTagsCountFeature(*et))
287 | return fv
288 | }
289 |
--------------------------------------------------------------------------------
/lib/feature/tweet/tweet_test.go:
--------------------------------------------------------------------------------
1 | package tweet_feature
2 |
3 | import (
4 | "reflect"
5 | "testing"
6 |
7 | "github.com/syou6162/go-active-learning/lib/feature"
8 | "github.com/syou6162/go-active-learning/lib/model"
9 | )
10 |
11 | func TestExtractHostFeature(t *testing.T) {
12 | e := model.Example{}
13 | e.Title = "Hello world"
14 | tweet := model.Tweet{}
15 | tweet.ScreenName = "syou6162"
16 | tweet.FullText = "Hello world @syou6162 @syou6163 #hashtag1 #hashtag2"
17 | tweet.FavoriteCount = 7
18 | tweet.RetweetCount = 7
19 |
20 | et := GetExampleAndTweet(&e, &tweet)
21 | fv := et.GetFeatureVector()
22 | expect := feature.FeatureVector{
23 | "BIAS",
24 | "LCSLenFeature:25",
25 | "CleanedLCSLenFeature:25",
26 | "LCSRatioFeature:0.25",
27 | "CleanedLCSRatioFeature:0.25",
28 | "TextLengthFeature:100",
29 | "CleanedTextLengthFeature:25",
30 | "ScreenNameFeature:syou6162",
31 | "FavoriteCountFeature:10",
32 | "RetweetCountFeature:10",
33 | "AtMarksCountFeature:3",
34 | "HashTagsCountFeature:3",
35 | }
36 | if !reflect.DeepEqual(expect, fv) {
37 | t.Error("feature must be wrong")
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/lib/fetcher/fetcher.go:
--------------------------------------------------------------------------------
1 | package fetcher
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 | "io/ioutil"
7 | "net/http"
8 | "regexp"
9 | "strings"
10 | "time"
11 |
12 | "net/url"
13 | "unicode/utf8"
14 |
15 | "github.com/PuerkitoBio/goquery"
16 | goose "github.com/syou6162/GoOse"
17 | )
18 |
// Article is the result of fetching and parsing one web page with
// GetArticle.
type Article struct {
	// Url is the canonical link of the page if it declares one, otherwise the
	// URL the request ended at, with tracking query parameters removed.
	Url           string
	Title         string
	Description   string
	OgDescription string
	OgType        string
	OgImage       string
	// Body is the cleaned text extracted from the page.
	Body string
	// StatusCode is the HTTP status code of the response.
	StatusCode int
	// Favicon is the page's favicon URL; it is left empty unless that URL is
	// absolute.
	Favicon     string
	PublishDate *time.Time
}
31 |
// articleFetcher is the shared HTTP client used by GetArticle. Each request
// is bounded by a 5 second timeout.
var articleFetcher = http.Client{
	Transport: &http.Transport{
		// Per the net/http documentation, zero means no limit on the total
		// number of idle connections.
		MaxIdleConns:        0,
		MaxIdleConnsPerHost: 100,
	},
	Timeout: time.Duration(5 * time.Second),
}
39 |
// updateTitleIfArxiv overwrites article.Title with the first capture group
// of a pattern matched against the raw HTML, but only when origUrl or
// finalUrl contains "https://arxiv.org/abs/". The title is left untouched
// when the pattern does not match. It always returns nil.
func updateTitleIfArxiv(article *goose.Article, origUrl string, finalUrl string, html []byte) error {
	arxivUrl := "https://arxiv.org/abs/"
	if strings.Contains(origUrl, arxivUrl) || strings.Contains(finalUrl, arxivUrl) {
		// arXiv pages contain several title tags, so the title is picked out
		// carefully with an explicit pattern.
		// NOTE(review): the pattern below looks as if its HTML markup was
		// stripped in this copy of the file — confirm against the upstream
		// source before relying on it.
		re := regexp.MustCompile(`
(.*?)`)
		m := re.FindSubmatch(html)
		if len(m) >= 2 {
			article.Title = string(m[1])
		}
	}
	return nil
}
52 |
53 | func updateMetaDescriptionIfArxiv(article *goose.Article, origUrl string, finalUrl string, html []byte) error {
54 | arxivUrl := "https://arxiv.org/abs/"
55 | if strings.Contains(origUrl, arxivUrl) || strings.Contains(finalUrl, arxivUrl) {
56 | // article.Docでもいけそうだが、gooseが中で書き換えていてダメ。Documentを作りなおす
57 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(html)))
58 | if err != nil {
59 | return err
60 | }
61 | article.MetaDescription = doc.Find(".abstract").Text()
62 | }
63 | return nil
64 | }
65 |
// removeUtmParams strips the tracking query parameters utm_source,
// utm_medium, utm_campaign, utm_term, utm_content and gi from origUrl and
// returns the re-encoded URL. When the URL or its query string cannot be
// parsed, origUrl is returned unchanged together with the error.
func removeUtmParams(origUrl string) (string, error) {
	parsed, err := url.Parse(origUrl)
	if err != nil {
		return origUrl, err
	}

	query, err := url.ParseQuery(parsed.RawQuery)
	if err != nil {
		return origUrl, err
	}

	trackingKeys := []string{
		"utm_source",
		"utm_medium",
		"utm_campaign",
		"utm_term",
		"utm_content",
		"gi",
	}
	for _, key := range trackingKeys {
		query.Del(key)
	}

	parsed.RawQuery = query.Encode()
	return parsed.String(), nil
}
89 |
90 | func GetArticle(origUrl string) (*Article, error) {
91 | g := goose.New()
92 | resp, err := articleFetcher.Get(origUrl)
93 | if err != nil {
94 | return nil, err
95 | }
96 | if resp.StatusCode == http.StatusFound ||
97 | resp.StatusCode == http.StatusUnauthorized ||
98 | resp.StatusCode == http.StatusForbidden ||
99 | resp.StatusCode == http.StatusNotFound ||
100 | resp.StatusCode == http.StatusGone ||
101 | resp.StatusCode == http.StatusBadGateway ||
102 | resp.StatusCode == http.StatusServiceUnavailable {
103 | return nil, errors.New(fmt.Sprintf("%s: Cannot fetch %s", resp.Status, origUrl))
104 | }
105 | defer resp.Body.Close()
106 |
107 | html, err := ioutil.ReadAll(resp.Body)
108 | if err != nil {
109 | return nil, err
110 | }
111 |
112 | if !utf8.Valid(html) {
113 | return nil, errors.New(fmt.Sprintf("Invalid utf8 document: %s", origUrl))
114 | }
115 |
116 | article, err := g.ExtractFromRawHTML(resp.Request.URL.String(), string(html))
117 | if err != nil {
118 | return nil, err
119 | }
120 |
121 | finalUrl := article.CanonicalLink
122 | if finalUrl == "" {
123 | finalUrl = resp.Request.URL.String()
124 | }
125 |
126 | finalUrl, err = removeUtmParams(finalUrl)
127 | if err != nil {
128 | return nil, err
129 | }
130 |
131 | updateTitleIfArxiv(article, origUrl, finalUrl, html)
132 | updateMetaDescriptionIfArxiv(article, origUrl, finalUrl, html)
133 |
134 | favicon := ""
135 | if u, err := url.Parse(article.MetaFavicon); err == nil {
136 | if u.IsAbs() {
137 | favicon = article.MetaFavicon
138 | }
139 | }
140 |
141 | return &Article{
142 | Url: finalUrl,
143 | Title: article.Title,
144 | Description: article.MetaDescription,
145 | OgDescription: article.MetaOgDescription,
146 | OgType: article.MetaOgType,
147 | OgImage: article.MetaOgImage,
148 | Body: article.CleanedText,
149 | StatusCode: resp.StatusCode,
150 | Favicon: favicon,
151 | PublishDate: article.PublishDate,
152 | }, nil
153 | }
154 |
--------------------------------------------------------------------------------
/lib/fetcher/fetcher_test.go:
--------------------------------------------------------------------------------
1 | package fetcher
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 | )
7 |
8 | func TestGetArticle(t *testing.T) {
9 | a, err := GetArticle("https://www.yasuhisay.info/entry/20090516/1242480413")
10 | if err != nil {
11 | t.Error(err.Error())
12 | }
13 |
14 | if a.Title == "" {
15 | t.Error("Title must not be empty")
16 | }
17 | if a.Description == "" {
18 | t.Error("Description must not be empty")
19 | }
20 | if a.OgType != "article" {
21 | t.Error("OgType must be article")
22 | }
23 | if a.StatusCode != 200 {
24 | t.Error("StatusCode must be 200")
25 | }
26 | }
27 |
28 | func TestGetArticleARXIV(t *testing.T) {
29 | a, err := GetArticle("https://arxiv.org/abs/2012.07805")
30 | if err != nil {
31 | t.Error(err.Error())
32 | }
33 |
34 | if a.Title != "[2012.07805] Extracting Training Data from Large Language Models" {
35 | t.Error("Title must not be empty")
36 | }
37 | if a.Description == "" {
38 | t.Error("Description must not be empty")
39 | }
40 | if a.StatusCode != 200 {
41 | t.Error("StatusCode must be 200")
42 | }
43 | }
44 |
45 | func TestGetArticleNotFound(t *testing.T) {
46 | _, err := GetArticle("https://www.yasuhisay.info/entry/NOT_FOUND")
47 | if err == nil {
48 | t.Error("Error should occur")
49 | }
50 | }
51 |
52 | func TestGetArticleWithInvalidEncoding(t *testing.T) {
53 | url := "http://www.atmarkit.co.jp/ait/articles/1702/20/news021.html"
54 | _, err := GetArticle(url)
55 | if err == nil {
56 | t.Error(fmt.Sprintf("Error must occur for this url: %s", url))
57 | }
58 | }
59 |
60 | func TestRemoveUtmParams(t *testing.T) {
61 | before := "https://techplay.jp/event/698349?utm_source=event_698349"
62 | after, err := removeUtmParams(before)
63 | if err != nil {
64 | t.Error(fmt.Sprintf("Error must occur for this url: %s", before))
65 | }
66 | expected := "https://techplay.jp/event/698349"
67 | if expected != after {
68 | t.Errorf("url should be %s, but %s", expected, after)
69 | }
70 | a, err := GetArticle(before)
71 | if expected != a.Url {
72 | t.Errorf("url should be %s, but %s", expected, a.Url)
73 | }
74 | }
75 |
76 | func TestFavicon(t *testing.T) {
77 | url := "https://www.yasuhisay.info/entry/2020/11/22/190000"
78 | a, err := GetArticle(url)
79 | if err != nil {
80 | t.Error(fmt.Sprintf("Error must not occur for this url: %s", url))
81 | }
82 | expectedFaviconPath := "https://www.yasuhisay.info/icon/favicon"
83 | if expectedFaviconPath != a.Favicon {
84 | t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath)
85 | }
86 |
87 | url = "https://www.lifehacker.jp/2018/11/amazon-impact-absorption-case.html"
88 | a, err = GetArticle(url)
89 | if err != nil {
90 | t.Error(fmt.Sprintf("Error must not occur for this url: %s", url))
91 | }
92 | expectedFaviconPath = "https://www.lifehacker.jp/assets/common/img/favicon.ico"
93 | if expectedFaviconPath != a.Favicon {
94 | t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath)
95 | }
96 |
97 | url = "https://peterroelants.github.io/"
98 | a, err = GetArticle(url)
99 | if err != nil {
100 | t.Error(fmt.Sprintf("Error must not occur for this url: %s", url))
101 | }
102 | expectedFaviconPath = "https://peterroelants.github.io/images/favicon/apple-icon-57x57.png"
103 | if expectedFaviconPath != a.Favicon {
104 | t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath)
105 | }
106 |
107 | url = "https://www.getrevue.co/profile/icoxfog417/issues/weekly-machine-learning-79-121292"
108 | a, err = GetArticle(url)
109 | if err != nil {
110 | t.Error(fmt.Sprintf("Error must not occur for this url: %s", url))
111 | }
112 | expectedFaviconPath = "https://d3jbm9h03wxzi9.cloudfront.net/assets/favicon-84fc7f228d52c2410eb7aa839e279caeaa491588c7c75229ed33e1c7f69fe75d.ico"
113 | if expectedFaviconPath != a.Favicon {
114 | t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath)
115 | }
116 |
117 | url = "https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html"
118 | a, err = GetArticle(url)
119 | if err != nil {
120 | t.Error(fmt.Sprintf("Error must not occur for this url: %s", url))
121 | }
122 | expectedFaviconPath = "https://ai.googleblog.com/favicon.ico"
123 | if expectedFaviconPath != a.Favicon {
124 | t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath)
125 | }
126 | }
127 |
128 | func TestGetPublishDate(t *testing.T) {
129 | a, err := GetArticle("https://www.yasuhisay.info/entry/2019/11/18/153000")
130 | if err != nil {
131 | t.Error("Error should not occur")
132 | }
133 | if a.PublishDate == nil {
134 | t.Error("PublishDate must not be nil")
135 | }
136 | }
137 |
--------------------------------------------------------------------------------
/lib/hatena_bookmark/hatena_bookmark.go:
--------------------------------------------------------------------------------
1 | package hatena_bookmark
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "io/ioutil"
7 | "net/http"
8 |
9 | "github.com/syou6162/go-active-learning/lib/model"
10 | )
11 |
12 | func GetHatenaBookmark(url string) (*model.HatenaBookmark, error) {
13 | // ref: http://developer.hatena.ne.jp/ja/documents/bookmark/apis/getinfo
14 | res, err := http.Get(fmt.Sprintf("https://b.hatena.ne.jp/entry/jsonlite/?url=%s", url))
15 | if err != nil {
16 | return nil, err
17 | }
18 | if res.StatusCode != http.StatusOK {
19 | return nil, fmt.Errorf("error: %d", res.StatusCode)
20 | }
21 |
22 | defer res.Body.Close()
23 | body, error := ioutil.ReadAll(res.Body)
24 | if error != nil {
25 | return nil, err
26 | }
27 |
28 | bookmarks := model.HatenaBookmark{}
29 | err = json.Unmarshal(body, &bookmarks)
30 | if error != nil {
31 | return nil, err
32 | }
33 | return &bookmarks, nil
34 | }
35 |
--------------------------------------------------------------------------------
/lib/hatena_bookmark/hatena_bookmark_test.go:
--------------------------------------------------------------------------------
1 | package hatena_bookmark
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestGetHatenaBookmark(t *testing.T) {
8 | bookmarks, err := GetHatenaBookmark("https://www.yasuhisay.info")
9 | if err != nil {
10 | t.Error(err.Error())
11 | }
12 |
13 | if bookmarks.Title == "" {
14 | t.Error("Title must not be empty")
15 | }
16 | if bookmarks.Count == 0 {
17 | t.Error("Count must not be 0")
18 | }
19 | if len(bookmarks.Bookmarks) == 0 {
20 | t.Error("Count must not be 0")
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/lib/model/error.go:
--------------------------------------------------------------------------------
1 | package model
2 |
// notFoundError is an error whose underlying string names the kind of thing
// that could not be found.
type notFoundError string

// Error renders the message as "<kind> not found".
func (err notFoundError) Error() string {
	const suffix = " not found"
	return string(err) + suffix
}

// NotFoundError builds a not-found error for the given kind name.
func NotFoundError(typ string) error {
	return notFoundError(typ)
}

// IsNotFound reports whether err was created by NotFoundError.
// A nil err yields false.
func IsNotFound(err error) bool {
	switch err.(type) {
	case notFoundError:
		return true
	default:
		return false
	}
}
17 |
--------------------------------------------------------------------------------
/lib/model/example.go:
--------------------------------------------------------------------------------
1 | package model
2 |
3 | import (
4 | "math"
5 | "strings"
6 | "time"
7 |
8 | "github.com/syou6162/go-active-learning/lib/feature"
9 | )
10 |
// Example is one URL together with its label, the page metadata and body
// text stored for it, its score, and bookkeeping fields.
//
// Fields carrying a `db` tag are mapped to the column named in the tag, and
// fields carrying a `json` tag are serialized under the given key.
type Example struct {
	Id    int       `db:"id"`
	Label LabelType `json:"Label" db:"label"`
	// Fv is the feature vector; it carries neither a db nor a json tag.
	Fv            feature.FeatureVector
	Url           string    `json:"Url" db:"url"`
	FinalUrl      string    `json:"FinalUrl" db:"final_url"`
	Title         string    `json:"Title" db:"title"`
	Description   string    `json:"Description" db:"description"`
	OgDescription string    `json:"OgDescription" db:"og_description"`
	OgType        string    `json:"OgType" db:"og_type"`
	OgImage       string    `json:"OgImage" db:"og_image"`
	Body          string    `json:"Body" db:"body"`
	Score         float64   `db:"score"`
	IsNew         bool      `db:"is_new"`
	StatusCode    int       `json:"StatusCode" db:"status_code"`
	Favicon       string    `json:"Favicon" db:"favicon"`
	ErrorCount    int       `json:"ErrorCount" db:"error_count"`
	CreatedAt     time.Time `json:"CreatedAt" db:"created_at"`
	UpdatedAt     time.Time `json:"UpdatedAt" db:"updated_at"`
	// The two fields below have json tags only (no db column tag).
	ReferringTweets *ReferringTweets `json:"ReferringTweets"`
	HatenaBookmark  *HatenaBookmark  `json:"HatenaBookmark"`
}
33 |
// Examples is a slice of Example pointers. Its Len, Less and Swap methods
// order elements by the absolute value of their Score.
type Examples []*Example
35 |
36 | func (example *Example) GetLabel() LabelType {
37 | return example.Label
38 | }
39 |
40 | func (example *Example) GetFeatureVector() feature.FeatureVector {
41 | return example.Fv
42 | }
43 |
44 | func (example *Example) Annotate(label LabelType) {
45 | example.Label = label
46 | }
47 |
48 | func (example *Example) IsLabeled() bool {
49 | return example.Label != UNLABELED
50 | }
51 |
52 | func (example *Example) IsTwitterUrl() bool {
53 | twitterUrl := "https://twitter.com"
54 | return strings.Contains(example.Url, twitterUrl) || strings.Contains(example.FinalUrl, twitterUrl)
55 | }
56 |
57 | func (example *Example) IsArticle() bool {
58 | // twitterはarticleと返ってくるが除外
59 | return example.OgType == "article" && !example.IsTwitterUrl()
60 | }
61 |
62 | func (slice Examples) Len() int {
63 | return len(slice)
64 | }
65 |
66 | func (slice Examples) Less(i, j int) bool {
67 | return math.Abs(slice[i].Score) < math.Abs(slice[j].Score)
68 | }
69 |
70 | func (slice Examples) Swap(i, j int) {
71 | slice[i], slice[j] = slice[j], slice[i]
72 | }
73 |
--------------------------------------------------------------------------------
/lib/model/hatena_bookmark.go:
--------------------------------------------------------------------------------
1 | package model
2 |
import (
	"database/sql/driver"
	"encoding/json"
	"fmt"
	"strings"
	"time"
)
9 |
// Tags is a list of tag strings. Its Scan and Value methods read and write it
// as a single tab-separated string.
type Tags []string
11 |
// HatenaBookmarkTime wraps a *time.Time so that it can be decoded from and
// encoded to the "2006/01/02 15:04" JSON timestamp format, and be read from
// and written to a database column.
//
// A nil Time stands for "no timestamp": it is encoded as JSON null and as a
// NULL driver value instead of panicking.
type HatenaBookmarkTime struct {
	*time.Time
}

// hatenaBookmarkTimeLayout is the time layout shared by the JSON methods.
const hatenaBookmarkTimeLayout = "2006/01/02 15:04"

// UnmarshalJSON parses a quoted "2006/01/02 15:04" timestamp.
// JSON null is accepted and leaves the receiver untouched. On a parse error
// the receiver is left untouched and the error is returned.
//
// ref: https://dev.classmethod.jp/go/struct-json/
func (hbt *HatenaBookmarkTime) UnmarshalJSON(data []byte) error {
	if string(data) == "null" {
		return nil
	}
	t, err := time.Parse(`"`+hatenaBookmarkTimeLayout+`"`, string(data))
	if err != nil {
		return err
	}
	hbt.Time = &t
	return nil
}

// MarshalJSON encodes the timestamp as a "2006/01/02 15:04" JSON string, or
// as null when no time is set.
func (hbt HatenaBookmarkTime) MarshalJSON() ([]byte, error) {
	if hbt.Time == nil {
		return []byte("null"), nil
	}
	return json.Marshal(hbt.Time.Format(hatenaBookmarkTimeLayout))
}

// Scan stores a database value into the receiver. A time.Time is stored as
// is, nil clears the time, and any other type is reported as an error rather
// than causing a panic.
//
// ref: https://qiita.com/roothybrid7/items/52623bedb45ff0c26a8a
func (hbt *HatenaBookmarkTime) Scan(value interface{}) error {
	switch v := value.(type) {
	case nil:
		hbt.Time = nil
		return nil
	case time.Time:
		hbt.Time = &v
		return nil
	default:
		return fmt.Errorf("cannot scan %T into HatenaBookmarkTime", value)
	}
}

// Value returns the wrapped time.Time as a driver value, or nil (NULL) when
// no time is set.
func (hbt HatenaBookmarkTime) Value() (driver.Value, error) {
	if hbt.Time == nil {
		return nil, nil
	}
	return *hbt.Time, nil
}
37 |
38 | func (tags *Tags) Scan(value interface{}) error {
39 | s := value.(string)
40 | if s == "" {
41 | *tags = Tags{}
42 | return nil
43 | }
44 | v := strings.Split(s, "\t")
45 | *tags = append(*tags, v...)
46 | return nil
47 | }
48 |
49 | func (tags Tags) Value() (driver.Value, error) {
50 | return strings.Join(tags, "\t"), nil
51 | }
52 |
// Bookmark is a single user's bookmark inside a HatenaBookmark entry.
// The json tags name the keys of the decoded API payload; the db tags name
// the columns the fields are mapped to.
type Bookmark struct {
	// HatenaBookmarkId has a db tag only; it is not part of the JSON payload.
	HatenaBookmarkId int                `db:"hatena_bookmark_id"`
	Timestamp        HatenaBookmarkTime `json:"timestamp" db:"timestamp"`
	User             string             `json:"user" db:"user"`
	Tags             Tags               `json:"tags" db:"tags"`
	Comment          string             `json:"comment" db:"comment"`
}
60 |
// HatenaBookmark is the bookmark entry for one URL: entry-level fields plus
// the list of individual bookmarks.
type HatenaBookmark struct {
	// Id and ExampleId have db tags only.
	Id        int    `db:"id"`
	ExampleId int    `db:"example_id"`
	Title     string `json:"title" db:"title"`
	// Bookmarks has a json tag only (no db column tag).
	Bookmarks  []*Bookmark `json:"bookmarks"`
	Screenshot string      `json:"screenshot" db:"screenshot"`
	EntryUrl   string      `json:"entry_url" db:"entry_url"`
	Count      int         `json:"count" db:"count"`
	Url        string      `json:"url" db:"url"`
	EId        string      `json:"eid" db:"eid"`
}
72 |
73 | func (bookmarks *HatenaBookmark) MarshalBinary() ([]byte, error) {
74 | json, err := json.Marshal(bookmarks)
75 | if err != nil {
76 | return nil, err
77 | }
78 | return []byte(json), nil
79 | }
80 |
81 | func (bookmarks *HatenaBookmark) UnmarshalBinary(data []byte) error {
82 | err := json.Unmarshal(data, bookmarks)
83 | if err != nil {
84 | return err
85 | }
86 | return nil
87 | }
88 |
--------------------------------------------------------------------------------
/lib/model/label_type.go:
--------------------------------------------------------------------------------
1 | package model
2 |
3 | import (
4 | "encoding/json"
5 | )
6 |
// LabelType is the annotation label of an example, represented as an integer.
type LabelType int

// MarshalBinary encodes the label as its JSON number representation.
func (lt *LabelType) MarshalBinary() ([]byte, error) {
	return json.Marshal(lt)
}

// UnmarshalBinary decodes a JSON number into the label.
// It returns the decoding error when data is not valid for a LabelType.
func (lt *LabelType) UnmarshalBinary(data []byte) error {
	// json.Unmarshal follows the pointer chain and stores the value in *lt.
	if err := json.Unmarshal(data, &lt); err != nil {
		return err
	}
	return nil
}

// Label values.
const (
	POSITIVE  LabelType = 1
	NEGATIVE  LabelType = -1
	UNLABELED LabelType = 0
)
25 |
--------------------------------------------------------------------------------
/lib/model/recommendation.go:
--------------------------------------------------------------------------------
1 | package model
2 |
3 | import "fmt"
4 |
// RecommendationListType identifies one of the recommendation lists.
type RecommendationListType int

// Known recommendation lists.
const (
	GENERAL RecommendationListType = 0
	ARTICLE RecommendationListType = 1
	GITHUB  RecommendationListType = 2
	SLIDE   RecommendationListType = 3
	ARXIV   RecommendationListType = 4
	VIDEO   RecommendationListType = 5
	EVENT   RecommendationListType = 6
)

// GetRecommendationListType maps a lower-case list name to its
// RecommendationListType. An unknown name yields -1 and an error.
func GetRecommendationListType(listname string) (RecommendationListType, error) {
	byName := map[string]RecommendationListType{
		"general": GENERAL,
		"article": ARTICLE,
		"github":  GITHUB,
		"slide":   SLIDE,
		"arxiv":   ARXIV,
		"video":   VIDEO,
		"event":   EVENT,
	}
	if listType, known := byName[listname]; known {
		return listType, nil
	}
	return -1, fmt.Errorf("no such RecommendationListType for '%s'", listname)
}
37 |
// Recommendation pairs a recommendation list type with the ids of the
// examples that belong to that list.
type Recommendation struct {
	RecommendationListType RecommendationListType
	ExampleIds             []int
}
42 |
--------------------------------------------------------------------------------
/lib/model/related_example.go:
--------------------------------------------------------------------------------
1 | package model
2 |
// RelatedExamples associates one example id with the ids of its related
// examples.
type RelatedExamples struct {
	ExampleId         int
	RelatedExampleIds []int
}
7 |
--------------------------------------------------------------------------------
/lib/model/tweet.go:
--------------------------------------------------------------------------------
1 | package model
2 |
3 | import (
4 | "time"
5 | )
6 |
// Tweet is a tweet record attached to an example (via ExampleId).
// The db tags name the columns the fields are mapped to; the json tags name
// the serialized keys.
type Tweet struct {
	// Id and ExampleId have db tags only.
	Id        int `db:"id"`
	ExampleId int `db:"example_id"`

	CreatedAt     time.Time `json:"CreatedAt" db:"created_at"`
	IdStr         string    `json:"IdStr" db:"id_str"`
	FullText      string    `json:"FullText" db:"full_text"`
	FavoriteCount int       `json:"FavoriteCount" db:"favorite_count"`
	RetweetCount  int       `json:"RetweetCount" db:"retweet_count"`
	Lang          string    `json:"Lang" db:"lang"`

	ScreenName      string    `json:"ScreenName" db:"screen_name"`
	Name            string    `json:"Name" db:"name"`
	ProfileImageUrl string    `json:"ProfileImageUrl" db:"profile_image_url"`
	Label           LabelType `json:"Label" db:"label"`
	Score           float64   `json:"Score" db:"score"`
}
24 |
// ReferringTweets is a list of tweets together with a count.
// NOTE(review): Count is stored separately from len(Tweets); presumably it
// can differ when only a subset of tweets is loaded — confirm against callers.
type ReferringTweets struct {
	Count  int      `json:"Count"`
	Tweets []*Tweet `json:"Tweets"`
}
29 |
--------------------------------------------------------------------------------
/lib/related_example/related_example.go:
--------------------------------------------------------------------------------
1 | package related_example
2 |
3 | import (
4 | "bufio"
5 | "fmt"
6 | "strconv"
7 | "strings"
8 |
9 | "os"
10 |
11 | "github.com/syou6162/go-active-learning/lib/model"
12 | "github.com/syou6162/go-active-learning/lib/service"
13 | "github.com/urfave/cli"
14 | )
15 |
// parseLine parses one "<exampleId>\t<relatedExampleId>" line.
//
// It returns both ids as ints. An error is returned when the line does not
// consist of exactly two tab-separated fields or when either field is not a
// decimal integer, so malformed ids are no longer silently read as 0.
func parseLine(line string) (int, int, error) {
	tokens := strings.Split(line, "\t")
	if len(tokens) != 2 {
		return 0, 0, fmt.Errorf("Invalid line: %s", line)
	}
	exampleId, err := strconv.Atoi(tokens[0])
	if err != nil {
		return 0, 0, fmt.Errorf("Invalid line: %s: %v", line, err)
	}
	relatedExampleId, err := strconv.Atoi(tokens[1])
	if err != nil {
		return 0, 0, fmt.Errorf("Invalid line: %s: %v", line, err)
	}
	return exampleId, relatedExampleId, nil
}
25 |
26 | func readRelatedExamples(filename string) ([]*model.RelatedExamples, error) {
27 | fp, err := os.Open(filename)
28 | defer fp.Close()
29 | if err != nil {
30 | return nil, err
31 | }
32 |
33 | exampleId2RelatedExampleIds := make(map[int][]int)
34 | scanner := bufio.NewScanner(fp)
35 | for scanner.Scan() {
36 | line := scanner.Text()
37 | exampleId, relatedExampleId, err := parseLine(line)
38 | if err != nil {
39 | return nil, err
40 | }
41 | if _, ok := exampleId2RelatedExampleIds[exampleId]; ok {
42 | exampleId2RelatedExampleIds[exampleId] = append(exampleId2RelatedExampleIds[exampleId], relatedExampleId)
43 | } else {
44 | exampleId2RelatedExampleIds[exampleId] = []int{relatedExampleId}
45 | }
46 | }
47 | if err := scanner.Err(); err != nil {
48 | return nil, err
49 | }
50 | result := make([]*model.RelatedExamples, 0)
51 | for exampleId, relatedExampleIds := range exampleId2RelatedExampleIds {
52 | result = append(result, &model.RelatedExamples{ExampleId: exampleId, RelatedExampleIds: relatedExampleIds})
53 | }
54 | return result, nil
55 | }
56 |
57 | func doAddRelatedExamples(c *cli.Context) error {
58 | inputFilename := c.String("input-filename")
59 |
60 | if inputFilename == "" {
61 | _ = cli.ShowCommandHelp(c, "add-related-examples")
62 | return cli.NewExitError("`input-filename` is a required field.", 1)
63 | }
64 |
65 | app, err := service.NewDefaultApp()
66 | if err != nil {
67 | return err
68 | }
69 | defer app.Close()
70 |
71 | relatedExamplesList, err := readRelatedExamples(inputFilename)
72 | if err != nil {
73 | return err
74 | }
75 | for _, relatedExamples := range relatedExamplesList {
76 | for _, related := range relatedExamples.RelatedExampleIds {
77 | fmt.Print(relatedExamples.ExampleId)
78 | fmt.Print("\t")
79 | fmt.Println(related)
80 | }
81 | err := app.UpdateRelatedExamples(*relatedExamples)
82 | if err != nil {
83 | return err
84 | }
85 | }
86 | return nil
87 | }
88 |
// CommandAddRelatedExamples defines the "add-related-examples" cli command,
// which runs doAddRelatedExamples.
var CommandAddRelatedExamples = cli.Command{
	Name:  "add-related-examples",
	Usage: "add related examples",
	Description: `
Add related examples.
`,
	Action: doAddRelatedExamples,
	Flags: []cli.Flag{
		// Path of the input file; the action rejects an empty value.
		cli.StringFlag{Name: "input-filename"},
	},
}
100 |
--------------------------------------------------------------------------------
/lib/repository/example.go:
--------------------------------------------------------------------------------
1 | package repository
2 |
3 | import (
4 | "bufio"
5 | "database/sql"
6 | "fmt"
7 | "io"
8 | "time"
9 |
10 | "github.com/lib/pq"
11 | "github.com/syou6162/go-active-learning/lib/feature"
12 | "github.com/syou6162/go-active-learning/lib/model"
13 | "github.com/syou6162/go-active-learning/lib/util/file"
14 | )
15 |
// exampleNotFoundError is the sentinel error returned by findExample when a
// query matches no row; GetErrorCount compares against it directly.
var exampleNotFoundError = model.NotFoundError("example")
17 |
18 | // データが存在しなければ追加
19 | // データが存在する場合は、以下の場合にのみ更新する
20 | // - ラベルが正例か負例に変更された
21 | // - クロール対象のサイトが一時的に200以外のステータスで前回データが取得できなかった
22 | func (r *repository) UpdateOrCreateExample(e *model.Example) error {
23 | now := time.Now()
24 | e.UpdatedAt = now
25 | _, err := r.db.NamedExec(`
26 | INSERT INTO example
27 | ( url, final_url, title, description, og_description, og_type, og_image, body, score, is_new, status_code, favicon, label, created_at, updated_at)
28 | VALUES
29 | (:url, :final_url, :title, :description, :og_description, :og_type, :og_image, :body, :score, :is_new, :status_code, :favicon, :label, :created_at, :updated_at)
30 | ON CONFLICT (url)
31 | DO UPDATE SET
32 | url = :url, final_url = :final_url, title = :title,
33 | description = :description, og_description = :og_description, og_type = :og_type, og_image = :og_image,
34 | body = :body, score = :score, is_new = :is_new, status_code = :status_code, favicon = :favicon,
35 | label = :label, created_at = :created_at, updated_at = :updated_at
36 | WHERE
37 | ((EXCLUDED.label != 0) AND (example.label != EXCLUDED.label)) OR
38 | ((example.status_code != 200) AND (EXCLUDED.status_code = 200))
39 | ;`, e)
40 | if err != nil {
41 | return err
42 | }
43 | tmp, err := r.FindExampleByUlr(e.Url)
44 | if err != nil {
45 | return err
46 | }
47 | e.Id = tmp.Id
48 | return nil
49 | }
50 |
51 | func (r *repository) UpdateScore(e *model.Example) error {
52 | if _, err := r.FindExampleByUlr(e.Url); err != nil {
53 | return err
54 | }
55 | if _, err := r.db.Exec(`UPDATE example SET score = $1, updated_at = $2 WHERE url = $3;`, e.Score, time.Now(), e.Url); err != nil {
56 | return err
57 | }
58 | return nil
59 | }
60 |
61 | func (r *repository) IncErrorCount(e *model.Example) error {
62 | errorCount, err := r.GetErrorCount(e)
63 | if err != nil {
64 | return err
65 | }
66 | if _, err := r.db.Exec(`UPDATE example SET error_count = $1, updated_at = $2 WHERE url = $3;`, errorCount+1, time.Now(), e.Url); err != nil {
67 | return err
68 | }
69 | return nil
70 | }
71 |
72 | func (r *repository) GetErrorCount(e *model.Example) (int, error) {
73 | example, err := r.FindExampleByUlr(e.Url)
74 | if err != nil {
75 | if err == exampleNotFoundError {
76 | return 0, nil
77 | }
78 | return 0, err
79 | }
80 | return example.ErrorCount, nil
81 | }
82 |
83 | func (r *repository) UpdateFeatureVector(e *model.Example) error {
84 | tmp, err := r.FindExampleByUlr(e.Url)
85 | if err != nil {
86 | return err
87 | }
88 | id := tmp.Id
89 | if _, err = r.db.Exec(`DELETE FROM feature WHERE example_id = $1;`, id); err != nil {
90 | return err
91 | }
92 | _, err = r.db.Exec(`INSERT INTO feature (example_id, feature) VALUES ($1, unnest(cast($2 AS TEXT[])));`, id, pq.Array(e.Fv))
93 | return err
94 | }
95 |
96 | func (r *repository) InsertExampleFromScanner(scanner *bufio.Scanner) (*model.Example, error) {
97 | line := scanner.Text()
98 | e, err := file.ParseLine(line)
99 | if err != nil {
100 | return nil, err
101 | }
102 | err = r.UpdateOrCreateExample(e)
103 | if err != nil {
104 | return nil, err
105 | }
106 | return e, nil
107 | }
108 |
109 | func (r *repository) InsertExamplesFromReader(reader io.Reader) error {
110 | scanner := bufio.NewScanner(reader)
111 |
112 | for scanner.Scan() {
113 | _, err := r.InsertExampleFromScanner(scanner)
114 | if err != nil {
115 | return err
116 | }
117 | }
118 | if err := scanner.Err(); err != nil {
119 | return err
120 | }
121 | return nil
122 | }
123 |
124 | func (r *repository) searchExamples(query string, args ...interface{}) (model.Examples, error) {
125 | examples := model.Examples{}
126 | err := r.db.Select(&examples, query, args...)
127 | if err != nil {
128 | return nil, err
129 | }
130 | return examples, nil
131 | }
132 |
133 | func (r *repository) findExample(query string, args ...interface{}) (*model.Example, error) {
134 | e := model.Example{}
135 |
136 | err := r.db.Get(&e, query, args...)
137 | if err != nil {
138 | if err == sql.ErrNoRows {
139 | return nil, exampleNotFoundError
140 | }
141 | return nil, err
142 | }
143 | return &e, nil
144 | }
145 |
146 | func (r *repository) SearchExamples() (model.Examples, error) {
147 | query := `SELECT * FROM example;`
148 | return r.searchExamples(query)
149 | }
150 |
151 | func (r *repository) SearchRecentExamples(from time.Time, limit int) (model.Examples, error) {
152 | query := `SELECT * FROM example WHERE created_at > $1 ORDER BY updated_at DESC LIMIT $2;`
153 | return r.searchExamples(query, from, limit)
154 | }
155 |
156 | func (r *repository) SearchRecentExamplesByHost(host string, from time.Time, limit int) (model.Examples, error) {
157 | query := `SELECT * FROM example WHERE final_url like $1 || '%' AND created_at > $2 ORDER BY updated_at DESC LIMIT $3;`
158 | return r.searchExamples(query, host, from, limit)
159 | }
160 |
161 | func (r *repository) SearchExamplesByLabel(label model.LabelType, limit int) (model.Examples, error) {
162 | query := `SELECT * FROM example WHERE label = $1 ORDER BY updated_at DESC LIMIT $2;`
163 | return r.searchExamples(query, label, limit)
164 | }
165 |
166 | func (r *repository) SearchLabeledExamples(limit int) (model.Examples, error) {
167 | query := `SELECT * FROM example WHERE label != 0 ORDER BY updated_at DESC LIMIT $1;`
168 | return r.searchExamples(query, limit)
169 | }
170 |
171 | func (r *repository) SearchPositiveExamples(limit int) (model.Examples, error) {
172 | return r.SearchExamplesByLabel(model.POSITIVE, limit)
173 | }
174 |
175 | func (r *repository) SearchNegativeExamples(limit int) (model.Examples, error) {
176 | return r.SearchExamplesByLabel(model.NEGATIVE, limit)
177 | }
178 |
179 | func (r *repository) SearchUnlabeledExamples(limit int) (model.Examples, error) {
180 | return r.SearchExamplesByLabel(model.UNLABELED, limit)
181 | }
182 |
183 | func (r *repository) SearchPositiveScoredExamples(limit int) (model.Examples, error) {
184 | query := `SELECT * FROM example WHERE score > 0 ORDER BY updated_at DESC LIMIT $1;`
185 | return r.searchExamples(query, limit)
186 | }
187 |
188 | func (r *repository) FindExampleByUlr(url string) (*model.Example, error) {
189 | query := `SELECT * FROM example WHERE url = $1;`
190 | return r.findExample(query, url)
191 | }
192 |
// buildSelectQuery returns the "SELECT <columns>" prefix for example queries
// (without a FROM clause).
//
// body and the other text columns can become extremely long, so with
// useTruncatedField the database shortens them before returning: title to 200
// characters, description, og_description and body to 1000.
func buildSelectQuery(useTruncatedField bool) string {
	textColumns := [4]interface{}{"title", "description", "og_description", "body"}
	if useTruncatedField {
		textColumns = [4]interface{}{
			"LEFT(title, 200) AS title",
			"LEFT(description, 1000) AS description",
			"LEFT(og_description, 1000) AS og_description",
			"LEFT(body, 1000) AS body",
		}
	}
	return fmt.Sprintf("SELECT id, label, url, final_url, %s, %s, %s, og_type, og_image, %s, score, is_new, status_code, favicon, error_count, created_at, updated_at", textColumns[:]...)
}
208 |
209 | func (r *repository) FindExampleById(id int) (*model.Example, error) {
210 | query := fmt.Sprintf(`%s FROM example WHERE id = $1;`, buildSelectQuery(true))
211 | return r.findExample(query, id)
212 | }
213 |
214 | func (r *repository) SearchExamplesByUlrs(urls []string) (model.Examples, error) {
215 | // ref: https://godoc.org/github.com/lib/pq#Array
216 | query := `SELECT * FROM example WHERE url = ANY($1);`
217 | return r.searchExamples(query, pq.Array(urls))
218 | }
219 |
220 | func (r *repository) SearchExamplesByIds(ids []int) (model.Examples, error) {
221 | if len(ids) == 0 {
222 | return model.Examples{}, nil
223 | }
224 | query := fmt.Sprintf(`%s FROM example WHERE id = ANY($1);`, buildSelectQuery(true))
225 | return r.searchExamples(query, pq.Array(ids))
226 | }
227 |
228 | func (r *repository) SearchExamplesByKeywords(keywords []string, aggregator string, limit int) (model.Examples, error) {
229 | if len(keywords) == 0 {
230 | return model.Examples{}, nil
231 | }
232 | regexList := make([]string, 0)
233 | for _, w := range keywords {
234 | regexList = append(regexList, fmt.Sprintf(`.*%s.*`, w))
235 | }
236 | query := fmt.Sprintf(`%s FROM example WHERE title ~* %s($1) AND label != -1 ORDER BY (label, score) DESC LIMIT $2;`, buildSelectQuery(true), aggregator)
237 | return r.searchExamples(query, pq.Array(regexList), limit)
238 | }
239 |
240 | func (r *repository) countExamplesByLabel(label model.LabelType) (int, error) {
241 | cnt := 0
242 | err := r.db.Get(&cnt, `SELECT COUNT(*) FROM example WHERE label = $1`, label)
243 | if err != nil {
244 | return 0, err
245 | }
246 | return cnt, nil
247 | }
248 |
249 | func (r *repository) CountPositiveExamples() (int, error) {
250 | return r.countExamplesByLabel(model.POSITIVE)
251 | }
252 |
253 | func (r *repository) CountNegativeExamples() (int, error) {
254 | return r.countExamplesByLabel(model.NEGATIVE)
255 | }
256 |
257 | func (r *repository) CountUnlabeledExamples() (int, error) {
258 | return r.countExamplesByLabel(model.UNLABELED)
259 | }
260 |
261 | func (r *repository) FindFeatureVector(e *model.Example) (feature.FeatureVector, error) {
262 | fv := feature.FeatureVector{}
263 | tmp, err := r.FindExampleByUlr(e.Url)
264 | if err != nil {
265 | return fv, err
266 | }
267 | id := tmp.Id
268 | query := `SELECT feature FROM feature WHERE example_id = $1;`
269 | err = r.db.Select(&fv, query, id)
270 | if err != nil {
271 | return fv, err
272 | }
273 | return fv, nil
274 | }
275 |
276 | func (r *repository) SearchFeatureVector(examples model.Examples) (map[int]feature.FeatureVector, error) {
277 | type Pair struct {
278 | ExampleId int `db:"example_id"`
279 | Feature string `db:"feature"`
280 | }
281 |
282 | fvById := make(map[int]feature.FeatureVector)
283 | urls := make([]string, 0)
284 | for _, e := range examples {
285 | urls = append(urls, e.Url)
286 | }
287 |
288 | tmp, err := r.SearchExamplesByUlrs(urls)
289 | if err != nil {
290 | return fvById, err
291 | }
292 | ids := make([]int, 0)
293 | for _, e := range tmp {
294 | ids = append(ids, e.Id)
295 | }
296 |
297 | query := `SELECT example_id, feature FROM feature WHERE example_id = ANY($1);`
298 | pairs := make([]Pair, 0)
299 | err = r.db.Select(&pairs, query, pq.Array(ids))
300 | if err != nil {
301 | return fvById, err
302 | }
303 |
304 | for _, pair := range pairs {
305 | fvById[pair.ExampleId] = append(fvById[pair.ExampleId], pair.Feature)
306 | }
307 | return fvById, nil
308 | }
309 |
310 | func (r *repository) DeleteAllExamples() error {
311 | _, err := r.db.Exec(`DELETE FROM example;`)
312 | return err
313 | }
314 |
--------------------------------------------------------------------------------
/lib/repository/example_test.go:
--------------------------------------------------------------------------------
1 | package repository_test
2 |
3 | import (
4 | "log"
5 | "os"
6 | "testing"
7 | "time"
8 |
9 | "github.com/syou6162/go-active-learning/lib/example"
10 | "github.com/syou6162/go-active-learning/lib/feature"
11 | "github.com/syou6162/go-active-learning/lib/model"
12 | "github.com/syou6162/go-active-learning/lib/repository"
13 | )
14 |
15 | func TestMain(m *testing.M) {
16 | repo, err := repository.New()
17 | if err != nil {
18 | log.Fatal(err.Error())
19 | }
20 | defer repo.Close()
21 |
22 | ret := m.Run()
23 | os.Exit(ret)
24 | }
25 |
26 | func TestPing(t *testing.T) {
27 | repo, err := repository.New()
28 | if err != nil {
29 | t.Errorf(err.Error())
30 | }
31 | defer repo.Close()
32 |
33 | if err := repo.Ping(); err != nil {
34 | t.Errorf(err.Error())
35 | }
36 | }
37 |
38 | func TestInsertExamplesFromReader(t *testing.T) {
39 | repo, err := repository.New()
40 | if err != nil {
41 | t.Errorf(err.Error())
42 | }
43 | defer repo.Close()
44 |
45 | if err = repo.DeleteAllExamples(); err != nil {
46 | t.Error(err)
47 | }
48 |
49 | fp, err := os.Open("../../tech_input_example.txt")
50 | defer fp.Close()
51 | if err != nil {
52 | t.Error(err)
53 | }
54 | repo.InsertExamplesFromReader(fp)
55 |
56 | examples, err := repo.SearchExamples()
57 | if err != nil {
58 | t.Error(err)
59 | }
60 | if len(examples) == 0 {
61 | t.Errorf("len(examples) > 0, but %d", len(examples))
62 | }
63 | }
64 |
65 | func TestInsertOrUpdateExample(t *testing.T) {
66 | repo, err := repository.New()
67 | if err != nil {
68 | t.Errorf(err.Error())
69 | }
70 | defer repo.Close()
71 |
72 | if err = repo.DeleteAllExamples(); err != nil {
73 | t.Error(err)
74 | }
75 |
76 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge.com", model.UNLABELED))
77 | if err != nil {
78 | t.Error(err)
79 | }
80 |
81 | examples, err := repo.SearchExamples()
82 | if err != nil {
83 | t.Error(err)
84 | }
85 | if len(examples) != 1 {
86 | t.Errorf("len(examples) == %d, want 1", len(examples))
87 | }
88 | if examples[0].Label != model.UNLABELED {
89 | t.Errorf("label == %d, want 0", examples[0].Label)
90 | }
91 | if examples[0].Id == 0 {
92 | t.Error("id must not be 0")
93 | }
94 |
95 | // same url
96 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge.com", model.NEGATIVE))
97 | if err != nil {
98 | t.Error(err)
99 | }
100 |
101 | examples, err = repo.SearchExamples()
102 | if err != nil {
103 | t.Error(err)
104 | }
105 | if len(examples) != 1 {
106 | t.Errorf("len(examples) == %d, want 1", len(examples))
107 | }
108 | if examples[0].Label != model.NEGATIVE {
109 | t.Errorf("label == %d, want -1", examples[0].Label)
110 | }
111 |
112 | // same url but different label
113 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge.com", model.POSITIVE))
114 | if err != nil {
115 | t.Error(err)
116 | }
117 |
118 | examples, err = repo.SearchExamples()
119 | if err != nil {
120 | t.Error(err)
121 | }
122 | if len(examples) != 1 {
123 | t.Errorf("len(examples) == %d, want 1", len(examples))
124 | }
125 | if examples[0].Label != model.POSITIVE {
126 | t.Errorf("label == %d, want 1", examples[0].Label)
127 | }
128 |
129 | // cannot update to unlabeled
130 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge.com", model.UNLABELED))
131 | if err != nil {
132 | t.Error(err)
133 | }
134 |
135 | examples, err = repo.SearchExamples()
136 | if err != nil {
137 | t.Error(err)
138 | }
139 | if len(examples) != 1 {
140 | t.Errorf("len(examples) == %d, want 1", len(examples))
141 | }
142 | if examples[0].Label != model.POSITIVE {
143 | t.Errorf("label == %d, want 1", examples[0].Label)
144 | }
145 |
146 | // different url
147 | err = repo.UpdateOrCreateExample(example.NewExample("http://another.com", model.NEGATIVE))
148 | if err != nil {
149 | t.Error(err)
150 | }
151 |
152 | examples, err = repo.SearchExamples()
153 | if err != nil {
154 | t.Error(err)
155 | }
156 | if len(examples) != 2 {
157 | t.Errorf("len(examples) == %d, want 2", len(examples))
158 | }
159 | }
160 |
161 | func TestUpdateScore(t *testing.T) {
162 | repo, err := repository.New()
163 | if err != nil {
164 | t.Errorf(err.Error())
165 | }
166 | defer repo.Close()
167 |
168 | if err = repo.DeleteAllExamples(); err != nil {
169 | t.Error(err)
170 | }
171 |
172 | url := "http://hoge.com"
173 | e := example.NewExample(url, model.UNLABELED)
174 | e.Score = 1.0
175 | err = repo.UpdateOrCreateExample(e)
176 | if err != nil {
177 | t.Error(err)
178 | }
179 |
180 | e, err = repo.FindExampleByUlr(url)
181 | if err != nil {
182 | t.Error(err)
183 | }
184 | if e.Score != 1.0 {
185 | t.Errorf("e.Score == %f, want 1.0", e.Score)
186 | }
187 |
188 | e.Score = 100.0
189 | err = repo.UpdateScore(e)
190 | if err != nil {
191 | t.Error(err)
192 | }
193 |
194 | e, err = repo.FindExampleByUlr(url)
195 | if err != nil {
196 | t.Error(err)
197 | }
198 | if e.Score != 100.0 {
199 | t.Errorf("e.Score == %f, want 100.0", e.Score)
200 | }
201 | }
202 |
203 | func TestErrorCount(t *testing.T) {
204 | repo, err := repository.New()
205 | if err != nil {
206 | t.Errorf(err.Error())
207 | }
208 | defer repo.Close()
209 |
210 | if err = repo.DeleteAllExamples(); err != nil {
211 | t.Error(err)
212 | }
213 |
214 | existingUrl := example.NewExample("https://github.com", model.POSITIVE)
215 | nonExistingUrl := example.NewExample("http://hoge.fuga", model.NEGATIVE)
216 | examples := model.Examples{existingUrl, nonExistingUrl}
217 |
218 | for _, e := range examples {
219 | if err := repo.UpdateOrCreateExample(e); err != nil {
220 | t.Error(err)
221 | }
222 |
223 | cnt, err := repo.GetErrorCount(e)
224 | if err != nil {
225 | t.Errorf("Cannot get error count: %s", err.Error())
226 | }
227 | if cnt != 0 {
228 | t.Errorf("Error count must be 0 for %s", e.Url)
229 | }
230 | }
231 |
232 | for _, e := range examples {
233 | err := repo.IncErrorCount(e)
234 | if err != nil {
235 | t.Errorf("Cannot get error count: %s", err.Error())
236 | }
237 | }
238 |
239 | for _, e := range examples {
240 | cnt, err := repo.GetErrorCount(e)
241 | if err != nil {
242 | t.Errorf("Cannot get error count: %s", err.Error())
243 | }
244 | if cnt != 1 {
245 | t.Errorf("Error count must be 1 for %s", e.Url)
246 | }
247 | }
248 | }
249 |
250 | func TestReadLabeledExamples(t *testing.T) {
251 | repo, err := repository.New()
252 | if err != nil {
253 | t.Errorf(err.Error())
254 | }
255 | defer repo.Close()
256 |
257 | if err = repo.DeleteAllExamples(); err != nil {
258 | t.Error(err)
259 | }
260 |
261 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE))
262 | if err != nil {
263 | t.Error(err)
264 | }
265 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
266 | if err != nil {
267 | t.Error(err)
268 | }
269 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
270 | if err != nil {
271 | t.Error(err)
272 | }
273 |
274 | examples, err := repo.SearchLabeledExamples(10)
275 | if err != nil {
276 | t.Error(err)
277 | }
278 | if len(examples) != 2 {
279 | t.Errorf("len(examples) == %d, want 2", len(examples))
280 | }
281 | }
282 |
283 | func TestReadRecentExamples(t *testing.T) {
284 | repo, err := repository.New()
285 | if err != nil {
286 | t.Errorf(err.Error())
287 | }
288 | defer repo.Close()
289 |
290 | if err = repo.DeleteAllExamples(); err != nil {
291 | t.Error(err)
292 | }
293 |
294 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE))
295 | if err != nil {
296 | t.Error(err)
297 | }
298 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
299 | if err != nil {
300 | t.Error(err)
301 | }
302 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
303 | if err != nil {
304 | t.Error(err)
305 | }
306 |
307 | examples, err := repo.SearchRecentExamples(time.Now().Add(time.Duration(-10)*time.Minute), 10)
308 | if err != nil {
309 | t.Error(err)
310 | }
311 | if len(examples) != 3 {
312 | t.Errorf("len(examples) == %d, want 3", len(examples))
313 | }
314 | }
315 |
316 | func TestReadRecentExamplesByHost(t *testing.T) {
317 | repo, err := repository.New()
318 | if err != nil {
319 | t.Errorf(err.Error())
320 | }
321 | defer repo.Close()
322 |
323 | if err = repo.DeleteAllExamples(); err != nil {
324 | t.Error(err)
325 | }
326 |
327 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE))
328 | if err != nil {
329 | t.Error(err)
330 | }
331 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
332 | if err != nil {
333 | t.Error(err)
334 | }
335 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
336 | if err != nil {
337 | t.Error(err)
338 | }
339 |
340 | examples, err := repo.SearchRecentExamplesByHost("http://hoge1.com", time.Now().Add(time.Duration(-10)*time.Minute), 10)
341 | if err != nil {
342 | t.Error(err)
343 | }
344 | if len(examples) != 1 {
345 | t.Errorf("len(examples) == %d, want 1", len(examples))
346 | }
347 | }
348 |
349 | func TestSearchExamplesByUlr(t *testing.T) {
350 | repo, err := repository.New()
351 | if err != nil {
352 | t.Errorf(err.Error())
353 | }
354 | defer repo.Close()
355 |
356 | if err = repo.DeleteAllExamples(); err != nil {
357 | t.Error(err)
358 | }
359 |
360 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.NEGATIVE))
361 | if err != nil {
362 | t.Error(err)
363 | }
364 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
365 | if err != nil {
366 | t.Error(err)
367 | }
368 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
369 | if err != nil {
370 | t.Error(err)
371 | }
372 |
373 | example, err := repo.FindExampleByUlr("http://hoge1.com")
374 | if err != nil {
375 | t.Error(err)
376 | }
377 | if example.Url == "" {
378 | t.Errorf("example.Url == %s, want http://hoge1.com", example.Url)
379 | }
380 |
381 | example, err = repo.FindExampleByUlr("http://hoge4.com")
382 | if err == nil {
383 | t.Errorf("search result must be nil")
384 | }
385 | }
386 |
387 | func TestSearchExamplesByUlrs(t *testing.T) {
388 | repo, err := repository.New()
389 | if err != nil {
390 | t.Errorf(err.Error())
391 | }
392 | defer repo.Close()
393 |
394 | if err = repo.DeleteAllExamples(); err != nil {
395 | t.Error(err)
396 | }
397 |
398 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.NEGATIVE))
399 | if err != nil {
400 | t.Error(err)
401 | }
402 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
403 | if err != nil {
404 | t.Error(err)
405 | }
406 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
407 | if err != nil {
408 | t.Error(err)
409 | }
410 |
411 | examples, err := repo.SearchExamplesByUlrs([]string{"http://hoge1.com", "http://hoge2.com"})
412 | if err != nil {
413 | t.Error(err)
414 | }
415 | if len(examples) != 2 {
416 | t.Errorf("len(examples) == %d, want 2", len(examples))
417 | }
418 | }
419 |
420 | func TestSearchExamplesByLabels(t *testing.T) {
421 | repo, err := repository.New()
422 | if err != nil {
423 | t.Errorf(err.Error())
424 | }
425 | defer repo.Close()
426 |
427 | if err = repo.DeleteAllExamples(); err != nil {
428 | t.Error(err)
429 | }
430 |
431 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE))
432 | if err != nil {
433 | t.Error(err)
434 | }
435 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
436 | if err != nil {
437 | t.Error(err)
438 | }
439 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
440 | if err != nil {
441 | t.Error(err)
442 | }
443 |
444 | examples, err := repo.SearchPositiveExamples(10)
445 | if err != nil {
446 | t.Error(err)
447 | }
448 | if len(examples) != 1 {
449 | t.Errorf("len(examples) == %d, want 1", len(examples))
450 | }
451 |
452 | examples, err = repo.SearchNegativeExamples(10)
453 | if err != nil {
454 | t.Error(err)
455 | }
456 | if len(examples) != 1 {
457 | t.Errorf("len(examples) == %d, want 1", len(examples))
458 | }
459 |
460 | examples, err = repo.SearchUnlabeledExamples(10)
461 | if err != nil {
462 | t.Error(err)
463 | }
464 | if len(examples) != 1 {
465 | t.Errorf("len(examples) == %d, want 1", len(examples))
466 | }
467 | }
468 |
469 | func TestCountExamplesByLabels(t *testing.T) {
470 | repo, err := repository.New()
471 | if err != nil {
472 | t.Errorf(err.Error())
473 | }
474 | defer repo.Close()
475 |
476 | if err = repo.DeleteAllExamples(); err != nil {
477 | t.Error(err)
478 | }
479 |
480 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE))
481 | if err != nil {
482 | t.Error(err)
483 | }
484 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
485 | if err != nil {
486 | t.Error(err)
487 | }
488 | err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
489 | if err != nil {
490 | t.Error(err)
491 | }
492 |
493 | cnt, err := repo.CountPositiveExamples()
494 | if err != nil {
495 | t.Error(err)
496 | }
497 | if cnt != 1 {
498 | t.Errorf("len(posExamples) == %d, want 1", cnt)
499 | }
500 |
501 | cnt, err = repo.CountNegativeExamples()
502 | if err != nil {
503 | t.Error(err)
504 | }
505 | if cnt != 1 {
506 | t.Errorf("len(negExamples) == %d, want 1", cnt)
507 | }
508 |
509 | cnt, err = repo.CountUnlabeledExamples()
510 | if err != nil {
511 | t.Error(err)
512 | }
513 | if cnt != 1 {
514 | t.Errorf("len(unlabeledExamples) == %d, want 1", cnt)
515 | }
516 | }
517 |
518 | func TestFeatureVectorReadWrite(t *testing.T) {
519 | repo, err := repository.New()
520 | if err != nil {
521 | t.Errorf(err.Error())
522 | }
523 | defer repo.Close()
524 |
525 | if err = repo.DeleteAllExamples(); err != nil {
526 | t.Error(err)
527 | }
528 |
529 | e1 := example.NewExample("http://hoge.com", model.UNLABELED)
530 | err = repo.UpdateOrCreateExample(e1)
531 | if err != nil {
532 | t.Error(err)
533 | }
534 | e1.Fv = feature.FeatureVector{"BIAS"}
535 |
536 | if err = repo.UpdateFeatureVector(e1); err != nil {
537 | t.Error(err)
538 | }
539 |
540 | fv, err := repo.FindFeatureVector(e1)
541 | if err != nil {
542 | t.Error(err)
543 | }
544 | if len(fv) != 1 {
545 | t.Errorf("len(fv) == %d, want 1", len(fv))
546 | }
547 |
548 | e2 := example.NewExample("http://fuga.com", model.UNLABELED)
549 | err = repo.UpdateOrCreateExample(e2)
550 | if err != nil {
551 | t.Error(err)
552 | }
553 | e2.Fv = feature.FeatureVector{"hoge"}
554 | if err = repo.UpdateFeatureVector(e2); err != nil {
555 | t.Error(err)
556 | }
557 | fvList, err := repo.SearchFeatureVector(model.Examples{e1, e2})
558 | if err != nil {
559 | t.Error(err)
560 | }
561 | if len(fvList) != 2 {
562 | t.Errorf("len(fvList) == %d, want 2", len(fvList))
563 | }
564 | if fvList[e2.Id][0] != "hoge" {
565 | t.Errorf("fvList[e2.Id][0] == %s, want hoge", fvList[e2.Id][0])
566 | }
567 | }
568 |
569 | func TestSearchExamplesByWords(t *testing.T) {
570 | repo, err := repository.New()
571 | if err != nil {
572 | t.Errorf(err.Error())
573 | }
574 | defer repo.Close()
575 |
576 | if err = repo.DeleteAllExamples(); err != nil {
577 | t.Error(err)
578 | }
579 |
580 | e1 := example.NewExample("http://hoge.com", model.UNLABELED)
581 | e1.Title = "日本語"
582 | err = repo.UpdateOrCreateExample(e1)
583 | if err != nil {
584 | t.Error(err)
585 | }
586 |
587 | e2 := example.NewExample("http://fuga.com", model.UNLABELED)
588 | e2.Title = "英語"
589 | err = repo.UpdateOrCreateExample(e2)
590 | if err != nil {
591 | t.Error(err)
592 | }
593 |
594 | examples, err := repo.SearchExamplesByKeywords([]string{"日本語"}, "ALL", 100)
595 | if len(examples) != 1 {
596 | t.Errorf("len(examples) == %d, want 1", len(examples))
597 | }
598 | examples, err = repo.SearchExamplesByKeywords([]string{"語"}, "ALL", 100)
599 | if len(examples) != 2 {
600 | t.Errorf("len(examples) == %d, want 2", len(examples))
601 | }
602 | examples, err = repo.SearchExamplesByKeywords([]string{"日本語", "英語"}, "ALL", 100)
603 | if len(examples) != 0 {
604 | t.Errorf("len(examples) == %d, want 0", len(examples))
605 | }
606 | examples, err = repo.SearchExamplesByKeywords([]string{"日本語", "英語"}, "ANY", 100)
607 | if len(examples) != 2 {
608 | t.Errorf("len(examples) == %d, want 2", len(examples))
609 | }
610 | }
611 |
--------------------------------------------------------------------------------
/lib/repository/hatena_bookmark.go:
--------------------------------------------------------------------------------
1 | package repository
2 |
3 | import (
4 | "github.com/lib/pq"
5 | "github.com/syou6162/go-active-learning/lib/model"
6 | )
7 |
8 | var hatenaBookmarkNotFoundError = model.NotFoundError("hatenaBookmark")
9 |
10 | func (r *repository) UpdateHatenaBookmark(e *model.Example) error {
11 | if e.HatenaBookmark == nil || e.HatenaBookmark.Count == 0 {
12 | return nil
13 | }
14 |
15 | tmp, err := r.FindExampleByUlr(e.Url)
16 | if err != nil {
17 | return err
18 | }
19 | id := tmp.Id
20 |
21 | e.HatenaBookmark.ExampleId = id
22 | if _, err = r.db.NamedExec(`
23 | INSERT INTO hatena_bookmark
24 | ( example_id, title, screenshot, entry_url, count, url, eid)
25 | VALUES
26 | (:example_id, :title, :screenshot, :entry_url, :count, :url, :eid)
27 | ON CONFLICT (example_id)
28 | DO UPDATE SET
29 | title = :title, count = :count
30 | ;`, e.HatenaBookmark); err != nil {
31 | return err
32 | }
33 |
34 | hb := model.HatenaBookmark{}
35 | if err = r.db.Get(&hb, `SELECT id FROM hatena_bookmark WHERE example_id = $1;`, id); err != nil {
36 | return err
37 | }
38 |
39 | for _, b := range e.HatenaBookmark.Bookmarks {
40 | b.HatenaBookmarkId = hb.Id
41 | if _, err = r.db.NamedExec(`
42 | INSERT INTO bookmark
43 | (hatena_bookmark_id, "user", comment, timestamp, tags)
44 | VALUES
45 | (:hatena_bookmark_id, :user, :comment, :timestamp, :tags)
46 | ON CONFLICT (hatena_bookmark_id, "user") DO NOTHING
47 | ;`, b); err != nil {
48 | return err
49 | }
50 | }
51 | return nil
52 | }
53 |
54 | func (r *repository) SearchHatenaBookmarks(examples model.Examples, limitForEachExample int) ([]*model.HatenaBookmark, error) {
55 | hatenaBookmarks := make([]*model.HatenaBookmark, 0)
56 | exampleIds := make([]int, 0)
57 | for _, e := range examples {
58 | exampleIds = append(exampleIds, e.Id)
59 | }
60 |
61 | query := `SELECT * FROM hatena_bookmark WHERE example_id = ANY($1);`
62 | err := r.db.Select(&hatenaBookmarks, query, pq.Array(exampleIds))
63 | if err != nil {
64 | return hatenaBookmarks, err
65 | }
66 |
67 | hatenaBookmarkIds := make([]int, 0)
68 | for _, hb := range hatenaBookmarks {
69 | hatenaBookmarkIds = append(hatenaBookmarkIds, hb.Id)
70 | hb.Bookmarks = make([]*model.Bookmark, 0)
71 | }
72 | if limitForEachExample == 0 {
73 | return hatenaBookmarks, nil
74 | }
75 |
76 | bookmarks := make([]*model.Bookmark, 0)
77 | query = `SELECT * FROM bookmark WHERE hatena_bookmark_id = ANY($1) ORDER BY timestamp LIMIT $2;`
78 | err = r.db.Select(&bookmarks, query, pq.Array(hatenaBookmarkIds), limitForEachExample)
79 | if err != nil {
80 | return hatenaBookmarks, err
81 | }
82 |
83 | bookmarksByHatenaBookmarkId := make(map[int][]*model.Bookmark)
84 | for _, b := range bookmarks {
85 | bookmarksByHatenaBookmarkId[b.HatenaBookmarkId] = append(bookmarksByHatenaBookmarkId[b.HatenaBookmarkId], b)
86 | }
87 |
88 | result := make([]*model.HatenaBookmark, 0)
89 | for _, hb := range hatenaBookmarks {
90 | bookmarks := bookmarksByHatenaBookmarkId[hb.Id]
91 | hb.Bookmarks = bookmarks
92 | result = append(result, hb)
93 | }
94 | return result, nil
95 | }
96 |
97 | func (r *repository) FindHatenaBookmark(e *model.Example, limit int) (*model.HatenaBookmark, error) {
98 | hatenaBookmark := &model.HatenaBookmark{}
99 |
100 | query := `SELECT * FROM hatena_bookmark WHERE example_id = $1;`
101 | err := r.db.Get(hatenaBookmark, query, e.Id)
102 | if err != nil {
103 | return hatenaBookmark, err
104 | }
105 |
106 | bookmarks := make([]*model.Bookmark, 0)
107 | if limit == 0 {
108 | hatenaBookmark.Bookmarks = bookmarks
109 | return hatenaBookmark, nil
110 | }
111 |
112 | hatenaBookmarkId := hatenaBookmark.Id
113 | query = `SELECT * FROM bookmark WHERE hatena_bookmark_id = $1 ORDER BY timestamp LIMIT $2;`
114 | err = r.db.Select(&bookmarks, query, hatenaBookmarkId, limit)
115 | if err != nil {
116 | return hatenaBookmark, err
117 | }
118 |
119 | hatenaBookmark.Bookmarks = bookmarks
120 | return hatenaBookmark, nil
121 | }
122 |
--------------------------------------------------------------------------------
/lib/repository/hatena_bookmark_test.go:
--------------------------------------------------------------------------------
1 | package repository_test
2 |
3 | import (
4 | "testing"
5 | "time"
6 |
7 | "github.com/syou6162/go-active-learning/lib/example"
8 | "github.com/syou6162/go-active-learning/lib/model"
9 | "github.com/syou6162/go-active-learning/lib/repository"
10 | )
11 |
12 | func TestUpdateHatenaBookmark(t *testing.T) {
13 | repo, err := repository.New()
14 | if err != nil {
15 | t.Errorf(err.Error())
16 | }
17 | defer repo.Close()
18 |
19 | if err = repo.DeleteAllExamples(); err != nil {
20 | t.Error(err)
21 | }
22 |
23 | e := example.NewExample("http://hoge.com", model.UNLABELED)
24 | err = repo.UpdateOrCreateExample(e)
25 | if err != nil {
26 | t.Error(err)
27 | }
28 | now := time.Now()
29 | b1 := model.Bookmark{
30 | User: "syou6162",
31 | Comment: "面白いサイトですね",
32 | Timestamp: model.HatenaBookmarkTime{Time: &now},
33 | Tags: model.Tags{"hack"},
34 | }
35 | hb := model.HatenaBookmark{
36 | ExampleId: e.Id,
37 | Title: "hoge",
38 | Count: 10,
39 | Bookmarks: []*model.Bookmark{&b1},
40 | }
41 | e.HatenaBookmark = &hb
42 | if err = repo.UpdateHatenaBookmark(e); err != nil {
43 | t.Error(err)
44 | }
45 |
46 | {
47 | result, err := repo.SearchHatenaBookmarks(model.Examples{e}, 10)
48 | if err != nil {
49 | t.Error(err)
50 | }
51 |
52 | for _, tmp := range result {
53 | if tmp.Title == "" {
54 | t.Error("Title must not be empty")
55 | }
56 | for _, b := range tmp.Bookmarks {
57 | if b.User == "" {
58 | t.Error("User must not be empty")
59 | }
60 | if len(b.Tags) == 0 {
61 | t.Error("Tags must not be empty")
62 | }
63 | }
64 | }
65 | }
66 |
67 | {
68 | result, err := repo.FindHatenaBookmark(e, 10)
69 | if err != nil {
70 | t.Error(err)
71 | }
72 |
73 | if result.Title == "" {
74 | t.Error("Title must not be empty")
75 | }
76 | for _, b := range result.Bookmarks {
77 | if b.User == "" {
78 | t.Error("User must not be empty")
79 | }
80 | if len(b.Tags) == 0 {
81 | t.Error("Tags must not be empty")
82 | }
83 | }
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/lib/repository/mira.go:
--------------------------------------------------------------------------------
1 | package repository
2 |
3 | import (
4 | "encoding/json"
5 |
6 | "github.com/syou6162/go-active-learning/lib/classifier"
7 | )
8 |
9 | func (r *repository) InsertMIRAModel(m classifier.MIRAClassifier) error {
10 | bytes, err := json.Marshal(m)
11 | if err != nil {
12 | return err
13 | }
14 | query := `INSERT INTO model (model_type, model, c, accuracy, precision, recall, fvalue) VALUES ($1, $2, $3, $4, $5, $6, $7);`
15 | if _, err := r.db.Exec(query, m.ModelType, string(bytes), m.C, m.Accuracy, m.Precision, m.Recall, m.Fvalue); err != nil {
16 | return err
17 | }
18 | return nil
19 | }
20 |
21 | func (r *repository) FindLatestMIRAModel(modelType classifier.ModelType) (*classifier.MIRAClassifier, error) {
22 | type Classifier struct {
23 | Model string
24 | }
25 | tmp := Classifier{}
26 |
27 | query := `SELECT model FROM model WHERE model_type = $1 ORDER BY created_at DESC LIMIT 1;`
28 | err := r.db.Get(&tmp, query, modelType)
29 | if err != nil {
30 | return nil, err
31 | }
32 |
33 | clf := classifier.MIRAClassifier{}
34 | if err := json.Unmarshal(([]byte)(tmp.Model), &clf); err != nil {
35 | return nil, err
36 | }
37 | return &clf, nil
38 | }
39 |
--------------------------------------------------------------------------------
/lib/repository/mira_test.go:
--------------------------------------------------------------------------------
1 | package repository_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/syou6162/go-active-learning/lib/classifier"
7 | "github.com/syou6162/go-active-learning/lib/repository"
8 | )
9 |
10 | func TestInsertMIRAModel(t *testing.T) {
11 | repo, err := repository.New()
12 | if err != nil {
13 | t.Errorf(err.Error())
14 | }
15 | defer repo.Close()
16 |
17 | weight := make(map[string]float64)
18 | weight["hoge"] = 1.0
19 | weight["fuga"] = 1.0
20 | clf := classifier.MIRAClassifier{classifier.EXAMPLE, weight, 10.0, 0.0, 0.0, 0.0, 0.0}
21 | err = repo.InsertMIRAModel(clf)
22 | if err != nil {
23 | t.Error(err)
24 | }
25 |
26 | {
27 | clf, err := repo.FindLatestMIRAModel(classifier.EXAMPLE)
28 | if err != nil {
29 | t.Error(err)
30 | }
31 | if len(clf.Weight) == 0 {
32 | t.Error("weight must not be empty")
33 | }
34 | if clf.C != 10.0 {
35 | t.Error("C must be 10.0")
36 | }
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/lib/repository/recommendation.go:
--------------------------------------------------------------------------------
1 | package repository
2 |
3 | import (
4 | "github.com/lib/pq"
5 | "github.com/syou6162/go-active-learning/lib/model"
6 | )
7 |
8 | func (r *repository) UpdateRecommendation(rec model.Recommendation) error {
9 | if _, err := r.db.Exec(`DELETE FROM recommendation WHERE list_type = $1;`, rec.RecommendationListType); err != nil {
10 | return err
11 | }
12 | if _, err := r.db.Exec(`INSERT INTO recommendation (list_type, example_id) VALUES ($1, unnest(cast($2 AS INT[])));`, rec.RecommendationListType, pq.Array(rec.ExampleIds)); err != nil {
13 | return err
14 | }
15 | return nil
16 | }
17 |
18 | func (r *repository) FindRecommendation(t model.RecommendationListType) (*model.Recommendation, error) {
19 | rec := &model.Recommendation{RecommendationListType: t}
20 | items := make([]int, 0)
21 | query := `SELECT example_id FROM recommendation WHERE list_type = $1;`
22 | err := r.db.Select(&items, query, t)
23 | if err != nil {
24 | return nil, err
25 | }
26 | rec.ExampleIds = items
27 | return rec, nil
28 | }
29 |
--------------------------------------------------------------------------------
/lib/repository/recommendation_test.go:
--------------------------------------------------------------------------------
1 | package repository_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/syou6162/go-active-learning/lib/example"
7 | "github.com/syou6162/go-active-learning/lib/model"
8 | "github.com/syou6162/go-active-learning/lib/repository"
9 | )
10 |
11 | func TestUpdateRecommendation(t *testing.T) {
12 | repo, err := repository.New()
13 | if err != nil {
14 | t.Errorf(err.Error())
15 | }
16 | defer repo.Close()
17 |
18 | e1 := example.NewExample("http://hoge1.com", model.POSITIVE)
19 | e2 := example.NewExample("http://hoge2.com", model.NEGATIVE)
20 | e3 := example.NewExample("http://hoge3.com", model.UNLABELED)
21 | examples := model.Examples{e1, e2, e3}
22 | for _, e := range examples {
23 | err = repo.UpdateOrCreateExample(e)
24 | if err != nil {
25 | t.Error(err)
26 | }
27 | }
28 | rec := model.Recommendation{RecommendationListType: model.GENERAL, ExampleIds: []int{e1.Id, e2.Id, e3.Id}}
29 | err = repo.UpdateRecommendation(rec)
30 | if err != nil {
31 | t.Error(err)
32 | }
33 |
34 | {
35 | rec, err := repo.FindRecommendation(model.GENERAL)
36 | if err != nil {
37 | t.Error(err)
38 | }
39 | if len(rec.ExampleIds) != 3 {
40 | t.Error("len(rec.ExampleIds) must be 3")
41 | }
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/lib/repository/related_example.go:
--------------------------------------------------------------------------------
1 | package repository
2 |
3 | import (
4 | "github.com/lib/pq"
5 | "github.com/syou6162/go-active-learning/lib/model"
6 | )
7 |
8 | func (r *repository) UpdateRelatedExamples(related model.RelatedExamples) error {
9 | if _, err := r.db.Exec(`DELETE FROM related_example WHERE example_id = $1;`, related.ExampleId); err != nil {
10 | return err
11 | }
12 | if _, err := r.db.Exec(`INSERT INTO related_example (example_id, related_example_id) VALUES ($1, unnest(cast($2 AS INT[])));`, related.ExampleId, pq.Array(related.RelatedExampleIds)); err != nil {
13 | return err
14 | }
15 | return nil
16 | }
17 |
18 | func (r *repository) FindRelatedExamples(e *model.Example) (*model.RelatedExamples, error) {
19 | related := &model.RelatedExamples{ExampleId: e.Id}
20 | items := make([]int, 0)
21 | query := `SELECT related_example_id FROM related_example WHERE example_id = $1;`
22 | err := r.db.Select(&items, query, e.Id)
23 | if err != nil {
24 | return nil, err
25 | }
26 | related.RelatedExampleIds = items
27 | return related, nil
28 | }
29 |
--------------------------------------------------------------------------------
/lib/repository/related_example_test.go:
--------------------------------------------------------------------------------
1 | package repository_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/syou6162/go-active-learning/lib/example"
7 | "github.com/syou6162/go-active-learning/lib/model"
8 | "github.com/syou6162/go-active-learning/lib/repository"
9 | )
10 |
11 | func TestUpdateRelatedExamples(t *testing.T) {
12 | repo, err := repository.New()
13 | if err != nil {
14 | t.Errorf(err.Error())
15 | }
16 | defer repo.Close()
17 |
18 | e1 := example.NewExample("http://hoge1.com", model.POSITIVE)
19 | e2 := example.NewExample("http://hoge2.com", model.NEGATIVE)
20 | e3 := example.NewExample("http://hoge3.com", model.UNLABELED)
21 | examples := model.Examples{e1, e2, e3}
22 | for _, e := range examples {
23 | err = repo.UpdateOrCreateExample(e)
24 | if err != nil {
25 | t.Error(err)
26 | }
27 | }
28 | related := model.RelatedExamples{ExampleId: e1.Id, RelatedExampleIds: []int{e2.Id, e3.Id}}
29 | err = repo.UpdateRelatedExamples(related)
30 | if err != nil {
31 | t.Error(err)
32 | }
33 |
34 | {
35 | related, err := repo.FindRelatedExamples(e1)
36 | if err != nil {
37 | t.Error(err)
38 | }
39 | if len(related.RelatedExampleIds) != 2 {
40 | t.Error("len(related.RelatedExampleIds) must be 2")
41 | }
42 | }
43 | {
44 | related, err := repo.FindRelatedExamples(e2)
45 | if err != nil {
46 | t.Error(err)
47 | }
48 | if len(related.RelatedExampleIds) != 0 {
49 | t.Error("len(related.RelatedExampleIds) must be 0")
50 | }
51 | }
52 | }
53 |
54 | func TestUpdateRelatedExamplesMyOwn(t *testing.T) {
55 | repo, err := repository.New()
56 | if err != nil {
57 | t.Errorf(err.Error())
58 | }
59 | defer repo.Close()
60 |
61 | e1 := example.NewExample("http://hoge1.com", model.POSITIVE)
62 | e2 := example.NewExample("http://hoge2.com", model.NEGATIVE)
63 | e3 := example.NewExample("http://hoge3.com", model.UNLABELED)
64 | examples := model.Examples{e1, e2, e3}
65 | for _, e := range examples {
66 | err = repo.UpdateOrCreateExample(e)
67 | if err != nil {
68 | t.Error(err)
69 | }
70 | }
71 | related := model.RelatedExamples{ExampleId: e1.Id, RelatedExampleIds: []int{e1.Id, e2.Id, e3.Id}}
72 | err = repo.UpdateRelatedExamples(related)
73 | if err == nil {
74 | t.Error("自身と同一のexample_idを持つ事例はrelated_example_idに追加できない")
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/lib/repository/repository.go:
--------------------------------------------------------------------------------
1 | package repository
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "time"
7 |
8 | "github.com/jmoiron/sqlx"
9 |
10 | "bufio"
11 |
12 | _ "github.com/lib/pq"
13 | "github.com/syou6162/go-active-learning/lib/classifier"
14 | "github.com/syou6162/go-active-learning/lib/feature"
15 | "github.com/syou6162/go-active-learning/lib/model"
16 | "github.com/syou6162/go-active-learning/lib/util"
17 | )
18 |
19 | type Repository interface {
20 | UpdateOrCreateExample(e *model.Example) error
21 | UpdateScore(e *model.Example) error
22 | InsertExampleFromScanner(scanner *bufio.Scanner) (*model.Example, error)
23 | InsertExamplesFromReader(reader io.Reader) error
24 | SearchExamples() (model.Examples, error)
25 | SearchRecentExamples(from time.Time, limit int) (model.Examples, error)
26 | SearchRecentExamplesByHost(host string, from time.Time, limit int) (model.Examples, error)
27 | SearchExamplesByLabel(label model.LabelType, limit int) (model.Examples, error)
28 | SearchLabeledExamples(limit int) (model.Examples, error)
29 | SearchPositiveExamples(limit int) (model.Examples, error)
30 | SearchNegativeExamples(limit int) (model.Examples, error)
31 | SearchUnlabeledExamples(limit int) (model.Examples, error)
32 | SearchPositiveScoredExamples(limit int) (model.Examples, error)
33 | FindExampleByUlr(url string) (*model.Example, error)
34 | FindExampleById(id int) (*model.Example, error)
35 | SearchExamplesByUlrs(urls []string) (model.Examples, error)
36 | SearchExamplesByIds(ids []int) (model.Examples, error)
37 | SearchExamplesByKeywords(keywords []string, aggregator string, limit int) (model.Examples, error)
38 | DeleteAllExamples() error
39 |
40 | CountPositiveExamples() (int, error)
41 | CountNegativeExamples() (int, error)
42 | CountUnlabeledExamples() (int, error)
43 |
44 | IncErrorCount(e *model.Example) error
45 | GetErrorCount(e *model.Example) (int, error)
46 |
47 | UpdateFeatureVector(e *model.Example) error
48 | FindFeatureVector(e *model.Example) (feature.FeatureVector, error)
49 | SearchFeatureVector(examples model.Examples) (map[int]feature.FeatureVector, error)
50 |
51 | UpdateHatenaBookmark(e *model.Example) error
52 | SearchHatenaBookmarks(examples model.Examples, limitForEachExample int) ([]*model.HatenaBookmark, error)
53 | FindHatenaBookmark(e *model.Example, limit int) (*model.HatenaBookmark, error)
54 |
55 | UpdateOrCreateReferringTweets(e *model.Example) error
56 | UpdateTweetLabel(exampleId int, idStr string, label model.LabelType) error
57 | SearchReferringTweetsList(examples model.Examples, limit int) (map[int]model.ReferringTweets, error)
58 | SearchReferringTweets(limit int) (model.ReferringTweets, error)
59 | SearchPositiveReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error)
60 | SearchNegativeReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error)
61 | SearchUnlabeledReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error)
62 | FindReferringTweets(e *model.Example, limit int) (model.ReferringTweets, error)
63 | SearchRecentReferringTweetsWithHighScore(from time.Time, scoreThreshold float64, limit int) (model.ReferringTweets, error)
64 |
65 | InsertMIRAModel(m classifier.MIRAClassifier) error
66 | FindLatestMIRAModel(modelType classifier.ModelType) (*classifier.MIRAClassifier, error)
67 |
68 | UpdateRecommendation(rec model.Recommendation) error
69 | FindRecommendation(t model.RecommendationListType) (*model.Recommendation, error)
70 |
71 | UpdateRelatedExamples(related model.RelatedExamples) error
72 | FindRelatedExamples(e *model.Example) (*model.RelatedExamples, error)
73 |
74 | UpdateTopAccessedExampleIds([]int) error
75 | SearchTopAccessedExampleIds() ([]int, error)
76 |
77 | Ping() error
78 | Close() error
79 | }
80 |
81 | type repository struct {
82 | db *sqlx.DB
83 | }
84 |
85 | func GetDataSourceName() string {
86 | host := util.GetEnv("POSTGRES_HOST", "localhost")
87 | dbUser := util.GetEnv("DB_USER", "nobody")
88 | dbPassword := util.GetEnv("DB_PASSWORD", "nobody")
89 | dbName := util.GetEnv("DB_NAME", "go-active-learning")
90 | return fmt.Sprintf(
91 | "host=%s user=%s password=%s dbname=%s sslmode=disable",
92 | host, dbUser, dbPassword, dbName,
93 | )
94 | }
95 |
96 | func New() (*repository, error) {
97 | db, err := sqlx.Open("postgres", GetDataSourceName())
98 | if err != nil {
99 | return nil, err
100 | }
101 | db.SetMaxOpenConns(50)
102 | return &repository{db: db}, nil
103 | }
104 |
105 | func (r *repository) Ping() error {
106 | return r.db.Ping()
107 | }
108 |
109 | func (r *repository) Close() error {
110 | if r.db != nil {
111 | return r.db.Close()
112 | } else {
113 | return nil
114 | }
115 | }
116 |
--------------------------------------------------------------------------------
/lib/repository/top_accessed_example.go:
--------------------------------------------------------------------------------
1 | package repository
2 |
3 | import (
4 | "github.com/lib/pq"
5 | )
6 |
7 | func (r *repository) UpdateTopAccessedExampleIds(exampleIds []int) error {
8 | if _, err := r.db.Exec(`DELETE FROM top_accessed_example;`); err != nil {
9 | return err
10 | }
11 | if _, err := r.db.Exec(`INSERT INTO top_accessed_example (example_id) VALUES (unnest(cast($1 AS INT[])));`, pq.Array(exampleIds)); err != nil {
12 | return err
13 | }
14 | return nil
15 | }
16 |
17 | func (r *repository) SearchTopAccessedExampleIds() ([]int, error) {
18 | exampleIds := make([]int, 0)
19 | query := `SELECT example_id FROM top_accessed_example;`
20 | err := r.db.Select(&exampleIds, query)
21 | if err != nil {
22 | return nil, err
23 | }
24 | return exampleIds, nil
25 | }
26 |
--------------------------------------------------------------------------------
/lib/repository/top_accessed_example_test.go:
--------------------------------------------------------------------------------
1 | package repository_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/syou6162/go-active-learning/lib/example"
7 | "github.com/syou6162/go-active-learning/lib/model"
8 | "github.com/syou6162/go-active-learning/lib/repository"
9 | )
10 |
11 | func TestUpdateTopAccessedExampleIds(t *testing.T) {
12 | repo, err := repository.New()
13 | if err != nil {
14 | t.Errorf(err.Error())
15 | }
16 | defer repo.Close()
17 |
18 | e1 := example.NewExample("http://hoge1.com", model.POSITIVE)
19 | e2 := example.NewExample("http://hoge2.com", model.NEGATIVE)
20 | e3 := example.NewExample("http://hoge3.com", model.UNLABELED)
21 | examples := model.Examples{e1, e2, e3}
22 | for _, e := range examples {
23 | err = repo.UpdateOrCreateExample(e)
24 | if err != nil {
25 | t.Error(err)
26 | }
27 | }
28 | exampleIds := make([]int, 0)
29 | for _, e := range examples {
30 | exampleIds = append(exampleIds, e.Id)
31 | }
32 | err = repo.UpdateTopAccessedExampleIds(exampleIds)
33 | if err != nil {
34 | t.Error(err)
35 | }
36 |
37 | {
38 | top, err := repo.SearchTopAccessedExampleIds()
39 | if err != nil {
40 | t.Error(err)
41 | }
42 | if len(top) != 3 {
43 | t.Error("len(top) must be 3")
44 | }
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/lib/repository/tweet.go:
--------------------------------------------------------------------------------
1 | package repository
2 |
3 | import (
4 | "time"
5 |
6 | "github.com/lib/pq"
7 | "github.com/syou6162/go-active-learning/lib/model"
8 | )
9 |
10 | func (r *repository) UpdateOrCreateReferringTweets(e *model.Example) error {
11 | if e.ReferringTweets == nil || len((*e).ReferringTweets.Tweets) == 0 || (*e).ReferringTweets.Count == 0 {
12 | return nil
13 | }
14 |
15 | tmp, err := r.FindExampleByUlr(e.Url)
16 | if err != nil {
17 | return err
18 | }
19 | id := tmp.Id
20 |
21 | for _, t := range (*e).ReferringTweets.Tweets {
22 | t.ExampleId = id
23 | if _, err = r.db.NamedExec(`
24 | INSERT INTO tweet
25 | ( example_id, created_at, id_str, full_text, favorite_count, retweet_count, lang, screen_name, name, profile_image_url, label, score)
26 | VALUES
27 | (:example_id, :created_at, :id_str, :full_text, :favorite_count, :retweet_count, :lang, :screen_name, :name, :profile_image_url, :label, :score)
28 | ON CONFLICT (example_id, id_str)
29 | DO UPDATE SET
30 | favorite_count = :favorite_count, retweet_count = :retweet_count, label = :label
31 | WHERE
32 | EXCLUDED.label != 0 AND tweet.label != EXCLUDED.label
33 | ;`, t); err != nil {
34 | return err
35 | }
36 | }
37 | return nil
38 | }
39 |
40 | func (r *repository) UpdateTweetLabel(exampleId int, idStr string, label model.LabelType) error {
41 | if _, err := r.db.Exec(`UPDATE tweet SET label = $1 WHERE example_id = $2 AND id_str = $3;`, label, exampleId, idStr); err != nil {
42 | return err
43 | }
44 | return nil
45 | }
46 |
47 | type exampleIdWithTweetsCount struct { // scan target for the per-example tweet count query in SearchReferringTweetsList
48 | ExampleId int `db:"example_id"` // mapped from the example_id column
49 | TweetsCount int `db:"tweets_count"` // mapped from the tweets_count column (the COUNT(*) alias)
50 | }
51 |
52 | // SearchReferringTweetsList returns the tweets referring to each of examples,
53 | // keyed by example id.
54 | //
55 | // Count is the total number of rows in the tweet table for the example. Tweets
56 | // holds at most limitForEachExample tweets per example, most favorited first,
57 | // restricted to label != -1, score > -1.0 and lang 'en' or 'ja'. When
58 | // limitForEachExample is 0 the tweet query is skipped and only the counts are
59 | // filled in. Every id in examples gets an entry, even when it has no tweets.
60 | // On a query error the map built so far is returned together with the error.
61 | func (r *repository) SearchReferringTweetsList(examples model.Examples, limitForEachExample int) (map[int]model.ReferringTweets, error) {
62 | 	referringTweetsByExampleId := make(map[int]model.ReferringTweets)
63 |
64 | 	exampleIds := make([]int, 0, len(examples))
65 | 	for _, e := range examples {
66 | 		exampleIds = append(exampleIds, e.Id)
67 | 	}
68 |
69 | 	exampleIdsWithTweetsCount := make([]exampleIdWithTweetsCount, 0)
70 | 	tweetsCountByExampleQuery := `SELECT example_id, COUNT(*) AS tweets_count FROM tweet WHERE example_id = ANY($1) GROUP BY example_id ORDER BY tweets_count DESC;`
71 | 	if err := r.db.Select(&exampleIdsWithTweetsCount, tweetsCountByExampleQuery, pq.Array(exampleIds)); err != nil {
72 | 		return referringTweetsByExampleId, err
73 | 	}
74 | 	tweetsCountByExampleId := make(map[int]int, len(exampleIdsWithTweetsCount))
75 | 	for _, e := range exampleIdsWithTweetsCount {
76 | 		tweetsCountByExampleId[e.ExampleId] = e.TweetsCount
77 | 	}
78 |
79 | 	tweetsByExampleId := make(map[int][]*model.Tweet)
80 | 	if limitForEachExample != 0 {
81 | 		tweets := make([]*model.Tweet, 0)
82 | 		// Rank the tweets inside each example so that the limit applies per example.
83 | 		// A plain LIMIT would cap the whole result set and could leave most of the
84 | 		// requested examples without any tweet.
85 | 		query := `
86 | WITH t AS (
87 | SELECT
88 | id,
89 | ROW_NUMBER() OVER(partition BY example_id ORDER BY favorite_count DESC) AS rank_example_id
90 | FROM
91 | tweet
92 | WHERE
93 | example_id = ANY($1) AND label != -1 AND score > -1.0 AND (lang = 'en' OR lang = 'ja')
94 | )
95 | SELECT
96 | *
97 | FROM
98 | tweet
99 | WHERE
100 | id IN (SELECT id FROM t WHERE rank_example_id <= $2)
101 | ORDER BY
102 | favorite_count DESC
103 | ;`
104 | 		if err := r.db.Select(&tweets, query, pq.Array(exampleIds), limitForEachExample); err != nil {
105 | 			return referringTweetsByExampleId, err
106 | 		}
107 | 		// Appending in result order keeps each example's tweets sorted by favorite_count.
108 | 		for _, t := range tweets {
109 | 			tweetsByExampleId[t.ExampleId] = append(tweetsByExampleId[t.ExampleId], t)
110 | 		}
111 | 	}
112 |
113 | 	for _, exampleId := range exampleIds {
114 | 		// A missing count entry reads as zero, which is the intended default.
115 | 		referringTweets := model.ReferringTweets{Count: tweetsCountByExampleId[exampleId], Tweets: make([]*model.Tweet, 0)}
116 | 		if tweets, ok := tweetsByExampleId[exampleId]; ok {
117 | 			referringTweets.Tweets = tweets
118 | 		}
119 | 		referringTweetsByExampleId[exampleId] = referringTweets
120 | 	}
121 | 	return referringTweetsByExampleId, nil
122 | }
105 |
106 | func (r *repository) SearchReferringTweets(limit int) (model.ReferringTweets, error) {
107 | referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)}
108 | query := `SELECT * FROM tweet WHERE lang = 'en' OR lang = 'ja' ORDER BY created_at DESC LIMIT $1;`
109 | err := r.db.Select(&referringTweets.Tweets, query, limit)
110 | if err != nil {
111 | return referringTweets, err
112 | }
113 | referringTweets.Count = len(referringTweets.Tweets)
114 | return referringTweets, nil
115 | }
116 |
117 | func (r *repository) SearchRecentReferringTweetsWithHighScore(from time.Time, scoreThreshold float64, limit int) (model.ReferringTweets, error) {
118 | referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)}
119 | query := `
120 | SELECT
121 | tweet.id,
122 | tweet.example_id,
123 |
124 | tweet.created_at,
125 | tweet.id_str,
126 | tweet.full_text,
127 | tweet.favorite_count,
128 | tweet.retweet_count,
129 | tweet.lang,
130 |
131 | tweet.screen_name,
132 | tweet.name,
133 | tweet.profile_image_url,
134 | tweet.label,
135 | tweet.score
136 | FROM
137 | tweet
138 | INNER JOIN
139 | example ON example.id = example_id
140 | WHERE
141 | tweet.created_at > $1 AND
142 | tweet.label != -1 AND
143 | example.label != -1 AND
144 | tweet.score > $2 AND
145 | (favorite_count > 0 OR retweet_count > 0) AND
146 | (lang = 'en' OR lang = 'ja')
147 | ORDER BY tweet.score DESC
148 | LIMIT $3
149 | ;
150 | `
151 | err := r.db.Select(&referringTweets.Tweets, query, from, scoreThreshold, limit)
152 | if err != nil {
153 | return referringTweets, err
154 | }
155 | referringTweets.Count = len(referringTweets.Tweets)
156 | return referringTweets, nil
157 | }
158 |
159 | func (r *repository) searchReferringTweetsByLabel(label model.LabelType, scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
160 | referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)}
161 | query := `
162 | WITH t AS (
163 | SELECT
164 | id,
165 | ROW_NUMBER() OVER(partition BY example_id ORDER BY favorite_count DESC) AS rank_example_id,
166 | ROW_NUMBER() OVER(partition BY id_str ORDER BY favorite_count DESC) AS rank_id_str
167 | FROM
168 | tweet
169 | WHERE
170 | example_id IN (SELECT id FROM example WHERE label != -1 AND updated_at > NOW() - INTERVAL '30 DAYS')
171 | AND label = $1 AND (lang = 'en' OR lang = 'ja') AND score > $2
172 | )
173 |
174 | SELECT
175 | *
176 | FROM
177 | tweet
178 | WHERE
179 | id IN (SELECT id FROM t WHERE rank_example_id <= $3 AND rank_id_str = 1)
180 | ORDER BY
181 | created_at DESC
182 | LIMIT $4
183 | ;`
184 | err := r.db.Select(&referringTweets.Tweets, query, label, scoreThreshold, tweetsLimitInSameExample, limit)
185 | if err != nil {
186 | return referringTweets, err
187 | }
188 | referringTweets.Count = len(referringTweets.Tweets)
189 | return referringTweets, nil
190 | }
191 |
192 | // SearchPositiveReferringTweets runs searchReferringTweetsByLabel with model.POSITIVE.
193 | func (r *repository) SearchPositiveReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
194 | 	tweets, err := r.searchReferringTweetsByLabel(model.POSITIVE, scoreThreshold, tweetsLimitInSameExample, limit)
195 | 	return tweets, err
196 | }
195 |
196 | // SearchNegativeReferringTweets runs searchReferringTweetsByLabel with model.NEGATIVE.
197 | func (r *repository) SearchNegativeReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
198 | 	tweets, err := r.searchReferringTweetsByLabel(model.NEGATIVE, scoreThreshold, tweetsLimitInSameExample, limit)
199 | 	return tweets, err
200 | }
199 |
200 | // SearchUnlabeledReferringTweets runs searchReferringTweetsByLabel with model.UNLABELED.
201 | func (r *repository) SearchUnlabeledReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
202 | 	tweets, err := r.searchReferringTweetsByLabel(model.UNLABELED, scoreThreshold, tweetsLimitInSameExample, limit)
203 | 	return tweets, err
204 | }
203 |
204 | type tweetsCount struct { // scan target for the COUNT(*) query in FindReferringTweets
205 | Count int `db:"count"` // mapped from the count column (the COUNT(*) alias)
206 | }
207 |
208 | func (r *repository) FindReferringTweets(e *model.Example, limit int) (model.ReferringTweets, error) {
209 | referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)}
210 |
211 | countQuery := `SELECT COUNT(*) AS count FROM tweet WHERE example_id = $1;`
212 | cnt := tweetsCount{}
213 | err := r.db.Get(&cnt, countQuery, e.Id)
214 | if err != nil {
215 | return referringTweets, err
216 | }
217 | referringTweets.Count = cnt.Count
218 | if limit == 0 {
219 | return referringTweets, err
220 | }
221 |
222 | query := `SELECT * FROM tweet WHERE example_id = $1 AND label != -1 AND score > 0.0 AND (lang = 'en' OR lang = 'ja') ORDER BY favorite_count DESC LIMIT $2;`
223 | err = r.db.Select(&referringTweets.Tweets, query, e.Id, limit)
224 | if err != nil {
225 | return referringTweets, err
226 | }
227 | return referringTweets, nil
228 | }
229 |
--------------------------------------------------------------------------------
/lib/repository/tweet_test.go:
--------------------------------------------------------------------------------
1 | package repository_test
2 |
3 | import (
4 | "testing"
5 | "time"
6 |
7 | "github.com/syou6162/go-active-learning/lib/example"
8 | "github.com/syou6162/go-active-learning/lib/model"
9 | "github.com/syou6162/go-active-learning/lib/repository"
10 | )
11 |
12 | // TestUpdateReferringTweets stores one tweet for one example and checks that it
13 | // is returned by SearchReferringTweetsList and FindReferringTweets, that a limit
14 | // of 0 yields only the count, and that relabeling the tweet as NEGATIVE hides it
15 | // from FindReferringTweets while the count stays at 1.
16 | func TestUpdateReferringTweets(t *testing.T) {
17 | 	repo, err := repository.New()
18 | 	if err != nil {
19 | 		// Nothing below can run without a repository, so abort instead of continuing.
20 | 		t.Fatal(err)
21 | 	}
22 | 	defer repo.Close()
23 |
24 | 	if err = repo.DeleteAllExamples(); err != nil {
25 | 		t.Fatal(err)
26 | 	}
27 |
28 | 	e := example.NewExample("http://hoge.com", model.UNLABELED)
29 | 	if err = repo.UpdateOrCreateExample(e); err != nil {
30 | 		t.Fatal(err)
31 | 	}
32 | 	now := time.Now()
33 | 	idStr := "1111111"
34 | 	t1 := model.Tweet{
35 | 		CreatedAt:       now,
36 | 		IdStr:           idStr,
37 | 		FullText:        "hello world!!!",
38 | 		FavoriteCount:   10,
39 | 		RetweetCount:    10,
40 | 		Lang:            "en",
41 | 		ScreenName:      "syou6162",
42 | 		Name:            "syou6162",
43 | 		ProfileImageUrl: "http://hogehoge.com/profile.png",
44 | 		Score:           1.0,
45 | 	}
46 |
47 | 	tweets := model.ReferringTweets{}
48 | 	tweets.Tweets = append(tweets.Tweets, &t1)
49 | 	tweets.Count = len(tweets.Tweets)
50 | 	e.ReferringTweets = &tweets
51 | 	if err = repo.UpdateOrCreateReferringTweets(e); err != nil {
52 | 		t.Fatal(err)
53 | 	}
54 |
55 | 	{
56 | 		result, err := repo.SearchReferringTweetsList(model.Examples{e}, 10)
57 | 		if err != nil {
58 | 			t.Fatal(err)
59 | 		}
60 | 		if len(result) == 0 {
61 | 			t.Error("result must not be empty")
62 | 		}
63 | 		if result[e.Id].Count == 0 {
64 | 			t.Error("result must not be zero")
65 | 		}
66 | 		// Fatal: the next check indexes Tweets[0] and would panic on an empty slice.
67 | 		if len(result[e.Id].Tweets) == 0 {
68 | 			t.Fatal("result must not be empty")
69 | 		}
70 | 		if result[e.Id].Tweets[0].Name != "syou6162" {
71 | 			t.Error("Name must be syou6162")
72 | 		}
73 | 	}
74 |
75 | 	{
76 | 		result, err := repo.FindReferringTweets(e, 10)
77 | 		if err != nil {
78 | 			t.Fatal(err)
79 | 		}
80 | 		if result.Count == 0 {
81 | 			t.Error("result must not be empty")
82 | 		}
83 | 		// Fatal: the next check indexes Tweets[0] and would panic on an empty slice.
84 | 		if len(result.Tweets) == 0 {
85 | 			t.Fatal("result must not be empty")
86 | 		}
87 | 		if result.Tweets[0].Name != "syou6162" {
88 | 			t.Error("Name must be syou6162")
89 | 		}
90 | 	}
91 |
92 | 	{
93 | 		result, err := repo.FindReferringTweets(e, 0)
94 | 		if err != nil {
95 | 			t.Error(err)
96 | 		}
97 | 		if len(result.Tweets) != 0 {
98 | 			t.Error("result must be empty")
99 | 		}
100 | 		if result.Count == 0 {
101 | 			t.Error("result must not be empty")
102 | 		}
103 | 	}
104 |
105 | 	{
106 | 		if err := repo.UpdateTweetLabel(e.Id, idStr, model.NEGATIVE); err != nil {
107 | 			t.Error(err)
108 | 		}
109 | 		result, err := repo.FindReferringTweets(e, 10)
110 | 		if err != nil {
111 | 			t.Error(err)
112 | 		}
113 | 		if len(result.Tweets) != 0 {
114 | 			t.Error("result must be empty")
115 | 		}
116 | 		if result.Count != 1 {
117 | 			t.Error("result must be 1")
118 | 		}
119 | 	}
120 | }
115 |
116 | // TestSearchReferringTweetsByLabel stores one POSITIVE tweet and checks that it is
117 | // found by SearchPositiveReferringTweets and not by SearchNegativeReferringTweets.
118 | func TestSearchReferringTweetsByLabel(t *testing.T) {
119 | 	repo, err := repository.New()
120 | 	if err != nil {
121 | 		// Nothing below can run without a repository, so abort instead of continuing.
122 | 		t.Fatal(err)
123 | 	}
124 | 	defer repo.Close()
125 |
126 | 	if err = repo.DeleteAllExamples(); err != nil {
127 | 		t.Fatal(err)
128 | 	}
129 |
130 | 	e := example.NewExample("http://hoge.com", model.UNLABELED)
131 | 	if err = repo.UpdateOrCreateExample(e); err != nil {
132 | 		t.Fatal(err)
133 | 	}
134 | 	now := time.Now()
135 | 	idStr := "1111111"
136 | 	t1 := model.Tweet{
137 | 		CreatedAt:       now,
138 | 		IdStr:           idStr,
139 | 		FullText:        "hello world!!!",
140 | 		FavoriteCount:   10,
141 | 		RetweetCount:    10,
142 | 		Lang:            "en",
143 | 		ScreenName:      "syou6162",
144 | 		Name:            "syou6162",
145 | 		ProfileImageUrl: "http://hogehoge.com/profile.png",
146 | 		Label:           model.POSITIVE,
147 | 	}
148 |
149 | 	tweets := model.ReferringTweets{}
150 | 	tweets.Tweets = append(tweets.Tweets, &t1)
151 | 	tweets.Count = len(tweets.Tweets)
152 | 	e.ReferringTweets = &tweets
153 | 	if err = repo.UpdateOrCreateReferringTweets(e); err != nil {
154 | 		t.Fatal(err)
155 | 	}
156 |
157 | 	limit := 10
158 | 	{
159 | 		result, err := repo.SearchPositiveReferringTweets(-1.0, 3, limit)
160 | 		if err != nil {
161 | 			t.Error(err)
162 | 		}
163 | 		if len(result.Tweets) != 1 {
164 | 			t.Error("len(result) must be 1")
165 | 		}
166 | 		if result.Count != 1 {
167 | 			t.Error("Count must be 1")
168 | 		}
169 | 	}
170 | 	{
171 | 		result, err := repo.SearchNegativeReferringTweets(-1.0, 3, limit)
172 | 		if err != nil {
173 | 			t.Error(err)
174 | 		}
175 | 		if len(result.Tweets) != 0 {
176 | 			t.Error("len(result) must be empty")
177 | 		}
178 | 		if result.Count != 0 {
179 | 			t.Error("Count must be zero")
180 | 		}
181 | 	}
182 | }
181 |
182 | // TestSearchRecentReferringTweetsWithHighScore stores three tweets (two with score
183 | // 10.0, one with score -10.0) and checks that a search with threshold 0.0 over the
184 | // last ten days returns exactly the two high scored ones.
185 | func TestSearchRecentReferringTweetsWithHighScore(t *testing.T) {
186 | 	repo, err := repository.New()
187 | 	if err != nil {
188 | 		// Nothing below can run without a repository, so abort instead of continuing.
189 | 		t.Fatal(err)
190 | 	}
191 | 	defer repo.Close()
192 |
193 | 	if err = repo.DeleteAllExamples(); err != nil {
194 | 		t.Fatal(err)
195 | 	}
196 |
197 | 	e := example.NewExample("http://hoge.com", model.UNLABELED)
198 | 	if err = repo.UpdateOrCreateExample(e); err != nil {
199 | 		t.Fatal(err)
200 | 	}
201 | 	now := time.Now()
202 | 	t1 := model.Tweet{
203 | 		CreatedAt:       now,
204 | 		IdStr:           "1111111",
205 | 		FullText:        "hello world!!!",
206 | 		FavoriteCount:   10,
207 | 		RetweetCount:    10,
208 | 		Lang:            "en",
209 | 		ScreenName:      "syou6162",
210 | 		Name:            "syou6162",
211 | 		ProfileImageUrl: "http://hogehoge.com/profile.png",
212 | 		Label:           model.POSITIVE,
213 | 		Score:           10.0,
214 | 	}
215 | 	t2 := model.Tweet{
216 | 		CreatedAt:       now,
217 | 		IdStr:           "22222222",
218 | 		FullText:        "hello world!!!",
219 | 		FavoriteCount:   10,
220 | 		RetweetCount:    10,
221 | 		Lang:            "en",
222 | 		ScreenName:      "syou6162",
223 | 		Name:            "syou6162",
224 | 		ProfileImageUrl: "http://hogehoge.com/profile.png",
225 | 		Label:           model.POSITIVE,
226 | 		Score:           10.0,
227 | 	}
228 | 	t3 := model.Tweet{
229 | 		CreatedAt:       now,
230 | 		IdStr:           "3333333333",
231 | 		FullText:        "hello world!!!",
232 | 		FavoriteCount:   10,
233 | 		RetweetCount:    10,
234 | 		Lang:            "en",
235 | 		ScreenName:      "syou6162",
236 | 		Name:            "syou6162",
237 | 		ProfileImageUrl: "http://hogehoge.com/profile.png",
238 | 		Label:           model.POSITIVE,
239 | 		Score:           -10.0,
240 | 	}
241 |
242 | 	tweets := model.ReferringTweets{}
243 | 	tweets.Tweets = append(tweets.Tweets, &t1, &t2, &t3)
244 | 	tweets.Count = len(tweets.Tweets)
245 | 	e.ReferringTweets = &tweets
246 | 	if err = repo.UpdateOrCreateReferringTweets(e); err != nil {
247 | 		t.Fatal(err)
248 | 	}
249 |
250 | 	limit := 10
251 | 	{
252 | 		result, err := repo.SearchRecentReferringTweetsWithHighScore(now.Add(time.Duration(-10*24)*time.Hour), 0.0, limit)
253 | 		if err != nil {
254 | 			t.Error(err)
255 | 		}
256 | 		if len(result.Tweets) != 2 {
257 | 			t.Error("len(result) must be 2")
258 | 		}
259 | 		if result.Count != 2 {
260 | 			t.Error("Count must be 2")
261 | 		}
262 | 	}
263 | }
261 |
--------------------------------------------------------------------------------
/lib/service/example.go:
--------------------------------------------------------------------------------
1 | package service
2 |
3 | import (
4 | "bufio"
5 | "fmt"
6 | "io"
7 | "log"
8 | "net/http"
9 | "runtime"
10 | "time"
11 |
12 | "math"
13 | "os"
14 | "strconv"
15 | "sync"
16 |
17 | "github.com/syou6162/go-active-learning/lib/example"
18 | "github.com/syou6162/go-active-learning/lib/fetcher"
19 | "github.com/syou6162/go-active-learning/lib/model"
20 | "github.com/syou6162/go-active-learning/lib/util"
21 | )
22 |
23 | // UpdateOrCreateExample hands e to the repository's UpdateOrCreateExample and returns its error as is.
24 | func (app *goActiveLearningApp) UpdateOrCreateExample(e *model.Example) error {
25 | 	err := app.repo.UpdateOrCreateExample(e)
26 | 	return err
27 | }
26 |
27 | // UpdateScore hands e to the repository's UpdateScore and returns its error as is.
28 | func (app *goActiveLearningApp) UpdateScore(e *model.Example) error {
29 | 	err := app.repo.UpdateScore(e)
30 | 	return err
31 | }
30 |
31 | // InsertExampleFromScanner passes scanner to the repository's InsertExampleFromScanner
32 | // and returns its result unchanged.
33 | func (app *goActiveLearningApp) InsertExampleFromScanner(scanner *bufio.Scanner) (*model.Example, error) {
34 | 	inserted, err := app.repo.InsertExampleFromScanner(scanner)
35 | 	return inserted, err
36 | }
34 |
35 | // InsertExamplesFromReader passes reader to the repository's InsertExamplesFromReader
36 | // and returns its error as is.
37 | func (app *goActiveLearningApp) InsertExamplesFromReader(reader io.Reader) error {
38 | 	err := app.repo.InsertExamplesFromReader(reader)
39 | 	return err
40 | }
38 |
39 | // SearchExamples returns whatever the repository's SearchExamples returns.
40 | func (app *goActiveLearningApp) SearchExamples() (model.Examples, error) {
41 | 	examples, err := app.repo.SearchExamples()
42 | 	return examples, err
43 | }
42 |
43 | // SearchRecentExamples forwards from and limit to the repository's SearchRecentExamples.
44 | func (app *goActiveLearningApp) SearchRecentExamples(from time.Time, limit int) (model.Examples, error) {
45 | 	examples, err := app.repo.SearchRecentExamples(from, limit)
46 | 	return examples, err
47 | }
46 |
47 | // SearchRecentExamplesByHost forwards host, from and limit to the repository's
48 | // SearchRecentExamplesByHost.
49 | func (app *goActiveLearningApp) SearchRecentExamplesByHost(host string, from time.Time, limit int) (model.Examples, error) {
50 | 	examples, err := app.repo.SearchRecentExamplesByHost(host, from, limit)
51 | 	return examples, err
52 | }
50 |
51 | // SearchExamplesByLabel forwards label and limit to the repository's SearchExamplesByLabel.
52 | func (app *goActiveLearningApp) SearchExamplesByLabel(label model.LabelType, limit int) (model.Examples, error) {
53 | 	examples, err := app.repo.SearchExamplesByLabel(label, limit)
54 | 	return examples, err
55 | }
54 |
55 | // SearchLabeledExamples forwards limit to the repository's SearchLabeledExamples.
56 | func (app *goActiveLearningApp) SearchLabeledExamples(limit int) (model.Examples, error) {
57 | 	examples, err := app.repo.SearchLabeledExamples(limit)
58 | 	return examples, err
59 | }
58 |
59 | // SearchPositiveExamples forwards limit to the repository's SearchPositiveExamples.
60 | func (app *goActiveLearningApp) SearchPositiveExamples(limit int) (model.Examples, error) {
61 | 	examples, err := app.repo.SearchPositiveExamples(limit)
62 | 	return examples, err
63 | }
62 |
63 | // SearchNegativeExamples forwards limit to the repository's SearchNegativeExamples.
64 | func (app *goActiveLearningApp) SearchNegativeExamples(limit int) (model.Examples, error) {
65 | 	examples, err := app.repo.SearchNegativeExamples(limit)
66 | 	return examples, err
67 | }
66 |
67 | // SearchUnlabeledExamples forwards limit to the repository's SearchUnlabeledExamples.
68 | func (app *goActiveLearningApp) SearchUnlabeledExamples(limit int) (model.Examples, error) {
69 | 	examples, err := app.repo.SearchUnlabeledExamples(limit)
70 | 	return examples, err
71 | }
70 |
71 | // SearchPositiveScoredExamples forwards limit to the repository's SearchPositiveScoredExamples.
72 | func (app *goActiveLearningApp) SearchPositiveScoredExamples(limit int) (model.Examples, error) {
73 | 	examples, err := app.repo.SearchPositiveScoredExamples(limit)
74 | 	return examples, err
75 | }
74 |
75 | // FindExampleByUlr forwards url to the repository's FindExampleByUlr.
76 | func (app *goActiveLearningApp) FindExampleByUlr(url string) (*model.Example, error) {
77 | 	found, err := app.repo.FindExampleByUlr(url)
78 | 	return found, err
79 | }
78 |
79 | // FindExampleById forwards id to the repository's FindExampleById.
80 | func (app *goActiveLearningApp) FindExampleById(id int) (*model.Example, error) {
81 | 	found, err := app.repo.FindExampleById(id)
82 | 	return found, err
83 | }
82 |
83 | // SearchExamplesByUlrs forwards urls to the repository's SearchExamplesByUlrs.
84 | func (app *goActiveLearningApp) SearchExamplesByUlrs(urls []string) (model.Examples, error) {
85 | 	examples, err := app.repo.SearchExamplesByUlrs(urls)
86 | 	return examples, err
87 | }
86 |
87 | // SearchExamplesByIds forwards ids to the repository's SearchExamplesByIds.
88 | func (app *goActiveLearningApp) SearchExamplesByIds(ids []int) (model.Examples, error) {
89 | 	examples, err := app.repo.SearchExamplesByIds(ids)
90 | 	return examples, err
91 | }
90 |
91 | // SearchExamplesByKeywords forwards keywords, aggregator and limit to the
92 | // repository's SearchExamplesByKeywords.
93 | func (app *goActiveLearningApp) SearchExamplesByKeywords(keywords []string, aggregator string, limit int) (model.Examples, error) {
94 | 	examples, err := app.repo.SearchExamplesByKeywords(keywords, aggregator, limit)
95 | 	return examples, err
96 | }
94 |
95 | // DeleteAllExamples calls the repository's DeleteAllExamples and returns its error as is.
96 | func (app *goActiveLearningApp) DeleteAllExamples() error {
97 | 	err := app.repo.DeleteAllExamples()
98 | 	return err
99 | }
98 |
99 | // CountPositiveExamples returns whatever the repository's CountPositiveExamples returns.
100 | func (app *goActiveLearningApp) CountPositiveExamples() (int, error) {
101 | 	count, err := app.repo.CountPositiveExamples()
102 | 	return count, err
103 | }
102 |
103 | // CountNegativeExamples returns whatever the repository's CountNegativeExamples returns.
104 | func (app *goActiveLearningApp) CountNegativeExamples() (int, error) {
105 | 	count, err := app.repo.CountNegativeExamples()
106 | 	return count, err
107 | }
106 |
107 | // CountUnlabeledExamples returns whatever the repository's CountUnlabeledExamples returns.
108 | func (app *goActiveLearningApp) CountUnlabeledExamples() (int, error) {
109 | 	count, err := app.repo.CountUnlabeledExamples()
110 | 	return count, err
111 | }
110 |
111 | // UpdateFeatureVector hands e to the repository's UpdateFeatureVector and returns its error as is.
112 | func (app *goActiveLearningApp) UpdateFeatureVector(e *model.Example) error {
113 | 	err := app.repo.UpdateFeatureVector(e)
114 | 	return err
115 | }
114 |
115 | // UpdateHatenaBookmark hands e to the repository's UpdateHatenaBookmark and returns its error as is.
116 | func (app *goActiveLearningApp) UpdateHatenaBookmark(e *model.Example) error {
117 | 	err := app.repo.UpdateHatenaBookmark(e)
118 | 	return err
119 | }
118 |
119 | // UpdateOrCreateReferringTweets hands e to the repository's UpdateOrCreateReferringTweets
120 | // and returns its error as is.
121 | func (app *goActiveLearningApp) UpdateOrCreateReferringTweets(e *model.Example) error {
122 | 	err := app.repo.UpdateOrCreateReferringTweets(e)
123 | 	return err
124 | }
122 |
123 | // UpdateTweetLabel forwards exampleId, idStr and label to the repository's UpdateTweetLabel.
124 | func (app *goActiveLearningApp) UpdateTweetLabel(exampleId int, idStr string, label model.LabelType) error {
125 | 	err := app.repo.UpdateTweetLabel(exampleId, idStr, label)
126 | 	return err
127 | }
126 |
127 | // SearchReferringTweets forwards limit to the repository's SearchReferringTweets.
128 | func (app *goActiveLearningApp) SearchReferringTweets(limit int) (model.ReferringTweets, error) {
129 | 	tweets, err := app.repo.SearchReferringTweets(limit)
130 | 	return tweets, err
131 | }
130 |
131 | // SearchPositiveReferringTweets forwards its arguments to the repository's
132 | // SearchPositiveReferringTweets.
133 | func (app *goActiveLearningApp) SearchPositiveReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
134 | 	tweets, err := app.repo.SearchPositiveReferringTweets(scoreThreshold, tweetsLimitInSameExample, limit)
135 | 	return tweets, err
136 | }
134 |
135 | // SearchNegativeReferringTweets forwards its arguments to the repository's
136 | // SearchNegativeReferringTweets.
137 | func (app *goActiveLearningApp) SearchNegativeReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
138 | 	tweets, err := app.repo.SearchNegativeReferringTweets(scoreThreshold, tweetsLimitInSameExample, limit)
139 | 	return tweets, err
140 | }
138 |
139 | // SearchUnlabeledReferringTweets forwards its arguments to the repository's
140 | // SearchUnlabeledReferringTweets.
141 | func (app *goActiveLearningApp) SearchUnlabeledReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
142 | 	tweets, err := app.repo.SearchUnlabeledReferringTweets(scoreThreshold, tweetsLimitInSameExample, limit)
143 | 	return tweets, err
144 | }
142 |
143 | // SearchRecentReferringTweetsWithHighScore forwards from, scoreThreshold and limit to
144 | // the repository's SearchRecentReferringTweetsWithHighScore.
145 | func (app *goActiveLearningApp) SearchRecentReferringTweetsWithHighScore(from time.Time, scoreThreshold float64, limit int) (model.ReferringTweets, error) {
146 | 	tweets, err := app.repo.SearchRecentReferringTweetsWithHighScore(from, scoreThreshold, limit)
147 | 	return tweets, err
148 | }
146 |
147 | // hatenaBookmarkByExampleId indexes hatenaBookmarks by their ExampleId.
148 | // When several entries share an id, the one appearing last in the slice is kept.
149 | func hatenaBookmarkByExampleId(hatenaBookmarks []*model.HatenaBookmark) map[int]*model.HatenaBookmark {
150 | 	byExampleId := make(map[int]*model.HatenaBookmark)
151 | 	for i := range hatenaBookmarks {
152 | 		bookmark := hatenaBookmarks[i]
153 | 		byExampleId[bookmark.ExampleId] = bookmark
154 | 	}
155 | 	return byExampleId
156 | }
154 |
155 | // AttachMetadataIncludingFeatureVector fills in the feature vector of every example
156 | // and then attaches bookmarks and tweets through AttachMetadata.
157 | //
158 | // Examples whose Id is 0 are first looked up by URL to obtain their id; a failed
159 | // lookup aborts the call. Examples without a stored feature vector keep their Fv.
160 | func (app *goActiveLearningApp) AttachMetadataIncludingFeatureVector(examples model.Examples, bookmarkLimit int, tweetLimit int) error {
161 | 	// The lookups below are keyed by example id, so every example needs one.
162 | 	for _, e := range examples {
163 | 		if e.Id != 0 {
164 | 			continue
165 | 		}
166 | 		stored, err := app.FindExampleByUlr(e.Url)
167 | 		if err != nil {
168 | 			return err
169 | 		}
170 | 		e.Id = stored.Id
171 | 	}
172 |
173 | 	fvByExampleId, err := app.repo.SearchFeatureVector(examples)
174 | 	if err != nil {
175 | 		return err
176 | 	}
177 | 	for _, e := range examples {
178 | 		fv, found := fvByExampleId[e.Id]
179 | 		if !found {
180 | 			continue
181 | 		}
182 | 		e.Fv = fv
183 | 	}
184 |
185 | 	return app.AttachMetadata(examples, bookmarkLimit, tweetLimit)
186 | }
180 |
181 | // AttachMetadata sets HatenaBookmark and ReferringTweets on every example.
182 | //
183 | // Examples for which the repository returns nothing receive an empty, non-nil
184 | // value: a HatenaBookmark with an empty Bookmarks slice, or a zero ReferringTweets.
185 | // The first repository error is returned; bookmarks may already be attached then.
186 | func (app *goActiveLearningApp) AttachMetadata(examples model.Examples, bookmarkLimit int, tweetLimit int) error {
187 | 	bookmarks, err := app.repo.SearchHatenaBookmarks(examples, bookmarkLimit)
188 | 	if err != nil {
189 | 		return err
190 | 	}
191 | 	bookmarkByExampleId := hatenaBookmarkByExampleId(bookmarks)
192 | 	for _, e := range examples {
193 | 		bookmark, found := bookmarkByExampleId[e.Id]
194 | 		if !found {
195 | 			bookmark = &model.HatenaBookmark{Bookmarks: []*model.Bookmark{}}
196 | 		}
197 | 		e.HatenaBookmark = bookmark
198 | 	}
199 |
200 | 	tweetsByExampleId, err := app.repo.SearchReferringTweetsList(examples, tweetLimit)
201 | 	if err != nil {
202 | 		return err
203 | 	}
204 | 	for _, e := range examples {
205 | 		// tweets is declared per iteration, so each example points at its own copy.
206 | 		tweets, found := tweetsByExampleId[e.Id]
207 | 		if !found {
208 | 			tweets = model.ReferringTweets{}
209 | 		}
210 | 		e.ReferringTweets = &tweets
211 | 	}
212 | 	return nil
213 | }
208 |
209 | // UpdateRelatedExamples hands related to the repository's UpdateRelatedExamples
210 | // and returns its error as is.
211 | func (app *goActiveLearningApp) UpdateRelatedExamples(related model.RelatedExamples) error {
212 | 	err := app.repo.UpdateRelatedExamples(related)
213 | 	return err
214 | }
212 |
213 | // SearchRelatedExamples looks up the related example ids recorded for e and
214 | // returns the examples with those ids. A failed id lookup yields (nil, err).
215 | func (app *goActiveLearningApp) SearchRelatedExamples(e *model.Example) (model.Examples, error) {
216 | 	related, err := app.repo.FindRelatedExamples(e)
217 | 	if err == nil {
218 | 		return app.repo.SearchExamplesByIds(related.RelatedExampleIds)
219 | 	}
220 | 	return nil, err
221 | }
220 |
221 | // UpdateTopAccessedExampleIds hands exampleIds to the repository's
222 | // UpdateTopAccessedExampleIds and returns its error as is.
223 | func (app *goActiveLearningApp) UpdateTopAccessedExampleIds(exampleIds []int) error {
224 | 	err := app.repo.UpdateTopAccessedExampleIds(exampleIds)
225 | 	return err
226 | }
224 |
225 | // SearchTopAccessedExamples reads the top accessed example ids from the repository
226 | // and returns the examples with those ids. A failed id lookup yields (nil, err).
227 | func (app *goActiveLearningApp) SearchTopAccessedExamples() (model.Examples, error) {
228 | 	ids, err := app.repo.SearchTopAccessedExampleIds()
229 | 	if err == nil {
230 | 		return app.repo.SearchExamplesByIds(ids)
231 | 	}
232 | 	return nil, err
233 | }
232 |
233 | // UpdateRecommendation stores the ids of examples as the recommendation list
234 | // called listName. An unknown list name is reported by
235 | // model.GetRecommendationListType and returned without touching the repository.
236 | func (app *goActiveLearningApp) UpdateRecommendation(listName string, examples model.Examples) error {
237 | 	listType, err := model.GetRecommendationListType(listName)
238 | 	if err != nil {
239 | 		return err
240 | 	}
241 |
242 | 	ids := make([]int, len(examples))
243 | 	for i, e := range examples {
244 | 		ids[i] = e.Id
245 | 	}
246 |
247 | 	return app.repo.UpdateRecommendation(model.Recommendation{RecommendationListType: listType, ExampleIds: ids})
248 | }
247 |
248 | // GetRecommendation returns the examples stored for the recommendation list
249 | // called listName.
250 | //
251 | // It returns (nil, err) when listName is not a known list type or when the
252 | // recommendation cannot be loaded from the repository.
253 | func (app *goActiveLearningApp) GetRecommendation(listName string) (model.Examples, error) {
254 | 	listType, err := model.GetRecommendationListType(listName)
255 | 	if err != nil {
256 | 		return nil, err
257 | 	}
258 | 	rec, err := app.repo.FindRecommendation(listType)
259 | 	if err != nil {
260 | 		// rec must not be used after a failed lookup.
261 | 		return nil, err
262 | 	}
263 | 	return app.repo.SearchExamplesByIds(rec.ExampleIds)
264 | }
256 |
257 | // splitExamplesByStatusOK partitions examples by what is stored for their URL.
258 | //
259 | // The first result holds the examples whose stored counterpart has StatusCode
260 | // http.StatusOK. The second holds the examples whose stored counterpart has any
261 | // other status, followed by the examples that have no stored counterpart at all.
262 | // The returned elements are the caller's own example values, not the stored ones.
263 | func (app *goActiveLearningApp) splitExamplesByStatusOK(examples model.Examples) (model.Examples, model.Examples, error) {
264 | 	urls := make([]string, 0)
265 | 	pendingByUrl := make(map[string]*model.Example)
266 | 	for _, e := range examples {
267 | 		pendingByUrl[e.Url] = e
268 | 		urls = append(urls, e.Url)
269 | 	}
270 | 	storedExamples, err := app.SearchExamplesByUlrs(urls)
271 | 	if err != nil {
272 | 		return nil, nil, err
273 | 	}
274 |
275 | 	withMetaData := model.Examples{}
276 | 	withEmptyMetaData := model.Examples{}
277 | 	for _, stored := range storedExamples {
278 | 		own := pendingByUrl[stored.Url]
279 | 		// Whatever is left in the map afterwards has no stored counterpart.
280 | 		delete(pendingByUrl, stored.Url)
281 | 		if stored.StatusCode == http.StatusOK {
282 | 			withMetaData = append(withMetaData, own)
283 | 			continue
284 | 		}
285 | 		withEmptyMetaData = append(withEmptyMetaData, own)
286 | 	}
287 | 	for _, own := range pendingByUrl {
288 | 		withEmptyMetaData = append(withEmptyMetaData, own)
289 | 	}
290 | 	return withMetaData, withEmptyMetaData, nil
291 | }
285 |
286 | // fetchMetaData downloads the article at e.Url and copies its metadata into e.
287 | //
288 | // Title, FinalUrl, the description and Open Graph fields, Body, StatusCode and
289 | // Favicon are overwritten from the article. CreatedAt and UpdatedAt are set to the
290 | // article's publish date only when that date lies between 2000-01-01 and now.
291 | // Finally the de-duplicated feature vector extracted from e is stored in e.Fv.
292 | //
293 | // It returns the fetcher's error, or an error when more than 100000 features were
294 | // extracted; in the latter case the metadata fields have already been updated.
295 | func fetchMetaData(e *model.Example) error {
296 | 	article, err := fetcher.GetArticle(e.Url)
297 | 	if err != nil {
298 | 		return err
299 | 	}
300 |
301 | 	e.Title = article.Title
302 | 	e.FinalUrl = article.Url
303 | 	e.Description = article.Description
304 | 	e.OgDescription = article.OgDescription
305 | 	e.OgType = article.OgType
306 | 	e.OgImage = article.OgImage
307 | 	e.Body = article.Body
308 | 	e.StatusCode = article.StatusCode
309 | 	e.Favicon = article.Favicon
310 |
311 | 	now := time.Now()
312 | 	tooOldDate := time.Date(2000, time.January, 1, 1, 1, 0, 0, time.UTC)
313 | 	// Both bounds must hold. With "||" every date passed, because any date is either
314 | 	// before now or after tooOldDate, so future and ancient dates were accepted.
315 | 	if article.PublishDate != nil && now.After(*article.PublishDate) && tooOldDate.Before(*article.PublishDate) {
316 | 		e.CreatedAt = *article.PublishDate
317 | 		e.UpdatedAt = *article.PublishDate
318 | 	}
319 |
320 | 	fv := util.RemoveDuplicate(example.ExtractFeatures(*e))
321 | 	if len(fv) > 100000 {
322 | 		return fmt.Errorf("too large features (N = %d) for %s", len(fv), e.FinalUrl)
323 | 	}
324 | 	e.Fv = fv
325 |
326 | 	return nil
327 | }
317 |
318 | // Fetch fills in the metadata of examples, working in batches of 100.
319 | //
320 | // For each batch, examples already stored with status 200 get their metadata and
321 | // feature vector attached from the repository. Every other example is fetched
322 | // concurrently through fetchMetaData, unless its error count has reached 5. A
323 | // failed fetch increments the example's error count. Errors are logged and never
324 | // returned; a batch that cannot be split is skipped.
325 | func (app *goActiveLearningApp) Fetch(examples model.Examples) {
326 | 	batchSize := 100
327 | 	examplesList := make([]model.Examples, 0)
328 | 	n := len(examples)
329 |
330 | 	for i := 0; i < n; i += batchSize {
331 | 		max := int(math.Min(float64(i+batchSize), float64(n)))
332 | 		examplesList = append(examplesList, examples[i:max])
333 | 	}
334 | 	for _, l := range examplesList {
335 | 		examplesWithMetaData, examplesWithEmptyMetaData, err := app.splitExamplesByStatusOK(l)
336 | 		if err != nil {
337 | 			// Both result slices are nil on error, so there is nothing to do for this batch.
338 | 			log.Println(err.Error())
339 | 			continue
340 | 		}
341 | 		// ToDo: consider whether this is really necessary
342 | 		if err := app.AttachMetadataIncludingFeatureVector(examplesWithMetaData, 0, 0); err != nil {
343 | 			log.Println(err.Error())
344 | 		}
345 |
346 | 		wg := &sync.WaitGroup{}
347 | 		cpus := runtime.NumCPU()
348 | 		runtime.GOMAXPROCS(cpus)
349 | 		// NOTE(review): the semaphore capacity equals the batch size and each batch is
350 | 		// awaited below, so it never blocks as written — confirm the intended limit.
351 | 		sem := make(chan struct{}, batchSize)
352 | 		for idx, e := range examplesWithEmptyMetaData {
353 | 			wg.Add(1)
354 | 			sem <- struct{}{}
355 | 			go func(e *model.Example, idx int) {
356 | 				defer wg.Done()
357 | 				cnt, err := app.repo.GetErrorCount(e)
358 | 				if err != nil {
359 | 					log.Println(err.Error())
360 | 				}
361 | 				if cnt < 5 {
362 | 					fmt.Fprintln(os.Stderr, "Fetching("+strconv.Itoa(idx)+"): "+e.Url)
363 | 					if err := fetchMetaData(e); err != nil {
364 | 						app.repo.IncErrorCount(e)
365 | 						log.Println(err.Error())
366 | 					}
367 | 				}
368 | 				<-sem
369 | 			}(e, idx)
370 | 		}
371 | 		wg.Wait()
372 | 	}
373 | }
361 |
--------------------------------------------------------------------------------
/lib/service/example_test.go:
--------------------------------------------------------------------------------
1 | package service_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/syou6162/go-active-learning/lib/example"
7 | "github.com/syou6162/go-active-learning/lib/model"
8 | "github.com/syou6162/go-active-learning/lib/service"
9 | )
10 |
11 | // findExampleByurl returns the first example in examples whose Url equals url,
12 | // or nil when there is none.
13 | func findExampleByurl(examples model.Examples, url string) *model.Example {
14 | 	for i := 0; i < len(examples); i++ {
15 | 		if candidate := examples[i]; candidate.Url == url {
16 | 			return candidate
17 | 		}
18 | 	}
19 | 	return nil
20 | }
19 |
20 | // TestAttachMetaData checks that freshly created examples carry no metadata, that
21 | // Fetch fills in title, feature vector and OgType, and that the examples can be
22 | // stored, reloaded by id and have their metadata attached without error.
23 | func TestAttachMetaData(t *testing.T) {
24 | 	app, err := service.NewDefaultApp()
25 | 	if err != nil {
26 | 		// Nothing below can run without an app, so abort instead of continuing.
27 | 		t.Fatal(err)
28 | 	}
29 | 	defer app.Close()
30 | 	if err := app.DeleteAllExamples(); err != nil {
31 | 		t.Fatal("Cannot delete examples")
32 | 	}
33 |
34 | 	hatebuUrl := "https://b.hatena.ne.jp"
35 | 	myBlogUrl := "https://www.yasuhisay.info"
36 | 	githubUrl := "https://github.com"
37 | 	e1 := example.NewExample(hatebuUrl, model.POSITIVE)
38 | 	e2 := example.NewExample(myBlogUrl, model.NEGATIVE)
39 | 	e3 := example.NewExample(githubUrl, model.UNLABELED)
40 | 	examples := model.Examples{e1, e2, e3}
41 |
42 | 	hatebu := findExampleByurl(examples, hatebuUrl)
43 | 	if hatebu == nil {
44 | 		// Fatal: hatebu is dereferenced right below.
45 | 		t.Fatalf("Cannot find %s", hatebuUrl)
46 | 	}
47 | 	if hatebu.Title != "" {
48 | 		t.Errorf("Title must be empty for %s", hatebu.Url)
49 | 	}
50 | 	if len(hatebu.Fv) != 0 {
51 | 		t.Errorf("Feature vector must be empty for %s", hatebu.Url)
52 | 	}
53 | 	// NOTE(review): the returned error is ignored here, and the examples have not
54 | 	// been stored yet at this point — confirm that this is deliberate.
55 | 	app.AttachMetadataIncludingFeatureVector(examples, 10, 10)
56 |
57 | 	if hatebu.Title != "" {
58 | 		t.Errorf("Title must be empty for %s", hatebu.Url)
59 | 	}
60 | 	if len(hatebu.Fv) != 0 {
61 | 		t.Errorf("Feature vector must be empty for %s", hatebu.Url)
62 | 	}
63 |
64 | 	myBlog := findExampleByurl(examples, myBlogUrl)
65 | 	if myBlog == nil {
66 | 		// Fatal: myBlog is dereferenced right below.
67 | 		t.Fatalf("Cannot find %s", myBlogUrl)
68 | 	}
69 | 	if myBlog.OgType != "" {
70 | 		t.Errorf("OgType must be empty for %s", myBlog.Url)
71 | 	}
72 |
73 | 	app.Fetch(examples)
74 | 	for _, e := range examples {
75 | 		err = app.UpdateOrCreateExample(e)
76 | 		if err != nil {
77 | 			t.Error(err)
78 | 		}
79 | 		err = app.UpdateFeatureVector(e)
80 | 		if err != nil {
81 | 			t.Error(err)
82 | 		}
83 | 	}
84 | 	if hatebu.Title == "" {
85 | 		t.Errorf("Title must not be empty for %s", hatebu.Url)
86 | 	}
87 | 	if len(hatebu.Fv) == 0 {
88 | 		t.Errorf("Feature vector must not be empty for %s", hatebu.Url)
89 | 	}
90 |
91 | 	if myBlog.OgType != "blog" {
92 | 		t.Errorf("OgType must be blog for %s", myBlog.Url)
93 | 	}
94 |
95 | 	examples, err = app.SearchExamplesByIds([]int{e1.Id, e2.Id, e3.Id})
96 | 	if err != nil {
97 | 		t.Error(err)
98 | 	}
99 | 	err = app.AttachMetadataIncludingFeatureVector(examples, 10, 10)
100 | 	if err != nil {
101 | 		t.Error(err)
102 | 	}
103 |
104 | 	// NOTE(review): hatebu and myBlog still point at the examples created before the
105 | 	// reload above, so the checks below do not inspect the reloaded examples —
106 | 	// confirm whether they should be looked up again.
107 | 	if hatebu.Title == "" {
108 | 		t.Errorf("Title must not be empty for %s", hatebu.Url)
109 | 	}
110 | 	if len(hatebu.Fv) == 0 {
111 | 		t.Errorf("Feature vector must not be empty for %s", hatebu.Url)
112 | 	}
113 |
114 | 	if myBlog.OgType != "blog" {
115 | 		t.Errorf("OgType must be blog for %s", myBlog.Url)
116 | 	}
117 | }
107 |
108 | // TestGetRecommendation stores three examples as the "general" recommendation
109 | // list and checks that reading the list back yields three examples.
110 | func TestGetRecommendation(t *testing.T) {
111 | 	app, err := service.NewDefaultApp()
112 | 	if err != nil {
113 | 		// Nothing below can run without an app, so abort instead of continuing.
114 | 		t.Fatal(err)
115 | 	}
116 | 	defer app.Close()
117 | 	if err := app.DeleteAllExamples(); err != nil {
118 | 		t.Fatal("Cannot delete examples")
119 | 	}
120 |
121 | 	e1 := example.NewExample("http://hoge1.com", model.POSITIVE)
122 | 	e2 := example.NewExample("http://hoge2.com", model.NEGATIVE)
123 | 	e3 := example.NewExample("http://hoge3.com", model.UNLABELED)
124 | 	examples := model.Examples{e1, e2, e3}
125 | 	for _, e := range examples {
126 | 		// The recommendation list is built from the ids of the stored examples.
127 | 		if err = app.UpdateOrCreateExample(e); err != nil {
128 | 			t.Fatal(err)
129 | 		}
130 | 	}
131 |
132 | 	listName := "general"
133 | 	if err = app.UpdateRecommendation(listName, examples); err != nil {
134 | 		t.Fatal(err)
135 | 	}
136 | 	examples, err = app.GetRecommendation(listName)
137 | 	if err != nil {
138 | 		t.Error(err)
139 | 	}
140 | 	if len(examples) != 3 {
141 | 		t.Errorf("len(examples) should be 3, but %d", len(examples))
142 | 	}
143 | }
142 |
--------------------------------------------------------------------------------
/lib/service/service.go:
--------------------------------------------------------------------------------
1 | package service
2 |
3 | import (
4 | "bufio"
5 | "io"
6 | "time"
7 |
8 | "github.com/syou6162/go-active-learning/lib/classifier"
9 | "github.com/syou6162/go-active-learning/lib/model"
10 | "github.com/syou6162/go-active-learning/lib/repository"
11 | )
12 |
// GoActiveLearningApp is the service-layer API of the application. NewApp and
// NewDefaultApp return the implementation backed by a repository.Repository.
// NOTE(review): only a few of these methods are implemented in this file;
// presumably the rest live in sibling files of the package — confirm.
type GoActiveLearningApp interface {
	// Creating, searching, deleting and counting examples.
	UpdateOrCreateExample(e *model.Example) error
	UpdateScore(e *model.Example) error
	InsertExampleFromScanner(scanner *bufio.Scanner) (*model.Example, error)
	InsertExamplesFromReader(reader io.Reader) error
	SearchExamples() (model.Examples, error)
	SearchRecentExamples(from time.Time, limit int) (model.Examples, error)
	SearchRecentExamplesByHost(host string, from time.Time, limit int) (model.Examples, error)
	SearchExamplesByLabel(label model.LabelType, limit int) (model.Examples, error)
	SearchLabeledExamples(limit int) (model.Examples, error)
	SearchPositiveExamples(limit int) (model.Examples, error)
	SearchNegativeExamples(limit int) (model.Examples, error)
	SearchUnlabeledExamples(limit int) (model.Examples, error)
	SearchPositiveScoredExamples(limit int) (model.Examples, error)
	// "Ulr" (sic) is part of the exported method name; renaming it would break callers.
	FindExampleByUlr(url string) (*model.Example, error)
	FindExampleById(id int) (*model.Example, error)
	// "Ulrs" (sic) is part of the exported method name; renaming it would break callers.
	SearchExamplesByUlrs(urls []string) (model.Examples, error)
	SearchExamplesByIds(ids []int) (model.Examples, error)
	SearchExamplesByKeywords(keywords []string, aggregator string, limit int) (model.Examples, error)
	DeleteAllExamples() error
	CountPositiveExamples() (int, error)
	CountNegativeExamples() (int, error)
	CountUnlabeledExamples() (int, error)

	// Storing and loading MIRA classifier models.
	InsertMIRAModel(m classifier.MIRAClassifier) error
	FindLatestMIRAModel(modelType classifier.ModelType) (*classifier.MIRAClassifier, error)

	// Per-example derived data (feature vector, Hatena bookmark, referring tweets).
	UpdateFeatureVector(e *model.Example) error
	UpdateHatenaBookmark(e *model.Example) error
	UpdateOrCreateReferringTweets(e *model.Example) error
	UpdateTweetLabel(exampleId int, idStr string, label model.LabelType) error
	SearchReferringTweets(limit int) (model.ReferringTweets, error)
	SearchPositiveReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error)
	SearchNegativeReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error)
	SearchUnlabeledReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error)
	SearchRecentReferringTweetsWithHighScore(from time.Time, scoreThreshold float64, limit int) (model.ReferringTweets, error)
	// Fetch has no error return; failures are not reported through this signature.
	Fetch(examples model.Examples)

	// Attaching stored metadata to already-loaded examples.
	AttachMetadataIncludingFeatureVector(examples model.Examples, bookmarkLimit int, tweetLimit int) error
	AttachMetadata(examples model.Examples, bookmarkLimit, tweetLimit int) error

	// Named recommendation lists.
	UpdateRecommendation(listName string, examples model.Examples) error
	GetRecommendation(listName string) (model.Examples, error)

	// Related examples.
	UpdateRelatedExamples(related model.RelatedExamples) error
	SearchRelatedExamples(e *model.Example) (model.Examples, error)

	// Top accessed examples.
	UpdateTopAccessedExampleIds(exampleIds []int) error
	SearchTopAccessedExamples() (model.Examples, error)

	// Connectivity check and shutdown of the underlying repository.
	Ping() error
	Close() error
}
66 |
67 | func NewApp(repo repository.Repository) GoActiveLearningApp {
68 | return &goActiveLearningApp{repo: repo}
69 | }
70 |
71 | func NewDefaultApp() (GoActiveLearningApp, error) {
72 | repo, err := repository.New()
73 | if err != nil {
74 | return nil, err
75 | }
76 | return &goActiveLearningApp{repo: repo}, nil
77 | }
78 |
// goActiveLearningApp is the concrete GoActiveLearningApp returned by NewApp
// and NewDefaultApp.
type goActiveLearningApp struct {
	// repo is the repository the methods in this file delegate to.
	repo repository.Repository
}
82 |
// InsertMIRAModel passes m to the repository's InsertMIRAModel and returns
// its error unchanged.
func (app *goActiveLearningApp) InsertMIRAModel(m classifier.MIRAClassifier) error {
	return app.repo.InsertMIRAModel(m)
}
86 |
// FindLatestMIRAModel forwards modelType to the repository's
// FindLatestMIRAModel and returns its result unchanged.
func (app *goActiveLearningApp) FindLatestMIRAModel(modelType classifier.ModelType) (*classifier.MIRAClassifier, error) {
	return app.repo.FindLatestMIRAModel(modelType)
}
90 |
91 | func (app *goActiveLearningApp) Ping() error {
92 | if err := app.repo.Ping(); err != nil {
93 | return err
94 | }
95 | return nil
96 | }
97 |
98 | func (app *goActiveLearningApp) Close() error {
99 | if err := app.repo.Close(); err != nil {
100 | return err
101 | }
102 | return nil
103 | }
104 |
--------------------------------------------------------------------------------
/lib/top_accessed_example/top_accessed_example.go:
--------------------------------------------------------------------------------
1 | package top_accessed_example
2 |
3 | import (
4 | "bufio"
5 | "fmt"
6 | "strconv"
7 |
8 | "os"
9 |
10 | "github.com/syou6162/go-active-learning/lib/service"
11 | "github.com/urfave/cli"
12 | )
13 |
// parseLine interprets line as a base-10 example id. A line that is not a
// valid integer yields 0 and an "Invalid line" error.
func parseLine(line string) (int, error) {
	id, convErr := strconv.Atoi(line)
	if convErr == nil {
		return id, nil
	}
	return 0, fmt.Errorf("Invalid line: %s", line)
}
21 |
22 | func readTopAccessedExampleIds(filename string) ([]int, error) {
23 | fp, err := os.Open(filename)
24 | defer fp.Close()
25 | if err != nil {
26 | return nil, err
27 | }
28 |
29 | exampleIds := make([]int, 0)
30 | scanner := bufio.NewScanner(fp)
31 | for scanner.Scan() {
32 | line := scanner.Text()
33 | exampleId, err := parseLine(line)
34 | if err != nil {
35 | return nil, err
36 | }
37 | exampleIds = append(exampleIds, exampleId)
38 | }
39 | if err := scanner.Err(); err != nil {
40 | return nil, err
41 | }
42 | return exampleIds, nil
43 | }
44 |
45 | func doAddTopAccessedExamples(c *cli.Context) error {
46 | inputFilename := c.String("input-filename")
47 |
48 | if inputFilename == "" {
49 | _ = cli.ShowCommandHelp(c, "add-top-accessed-examples")
50 | return cli.NewExitError("`input-filename` is a required field.", 1)
51 | }
52 |
53 | app, err := service.NewDefaultApp()
54 | if err != nil {
55 | return err
56 | }
57 | defer app.Close()
58 |
59 | exampleIds, err := readTopAccessedExampleIds(inputFilename)
60 | if err != nil {
61 | return err
62 | }
63 | err = app.UpdateTopAccessedExampleIds(exampleIds)
64 | if err != nil {
65 | return err
66 | }
67 | return nil
68 | }
69 |
// CommandAddTopAccessedExamples is the cli command definition for
// "add-top-accessed-examples". It runs doAddTopAccessedExamples and declares
// the string flag "input-filename".
var CommandAddTopAccessedExamples = cli.Command{
	Name:  "add-top-accessed-examples",
	Usage: "add top accessed examples",
	Description: `
Add top accessed examples.
`,
	Action: doAddTopAccessedExamples,
	Flags: []cli.Flag{
		cli.StringFlag{Name: "input-filename"},
	},
}
81 |
--------------------------------------------------------------------------------
/lib/util/converter/converter.go:
--------------------------------------------------------------------------------
1 | package converter
2 |
3 | import "github.com/syou6162/go-active-learning/lib/model"
4 | import "github.com/syou6162/go-active-learning/lib/classifier"
5 |
6 | func ConvertExamplesToLearningInstances(examples model.Examples) classifier.LearningInstances {
7 | instances := classifier.LearningInstances{}
8 | for _, e := range examples {
9 | instances = append(instances, e)
10 | }
11 | return instances
12 | }
13 |
--------------------------------------------------------------------------------
/lib/util/file/file.go:
--------------------------------------------------------------------------------
1 | package file
2 |
3 | import (
4 | "bufio"
5 | "errors"
6 | "fmt"
7 | "os"
8 | "strconv"
9 | "strings"
10 |
11 | "github.com/syou6162/go-active-learning/lib/example"
12 | "github.com/syou6162/go-active-learning/lib/model"
13 | )
14 |
15 | func ParseLine(line string) (*model.Example, error) {
16 | tokens := strings.Split(line, "\t")
17 | var url string
18 | if len(tokens) == 1 {
19 | url = tokens[0]
20 | return example.NewExample(url, model.UNLABELED), nil
21 | } else if len(tokens) == 2 {
22 | url = tokens[0]
23 | label, _ := strconv.ParseInt(tokens[1], 10, 0)
24 | switch model.LabelType(label) {
25 | case model.POSITIVE, model.NEGATIVE, model.UNLABELED:
26 | return example.NewExample(url, model.LabelType(label)), nil
27 | default:
28 | return nil, errors.New(fmt.Sprintf("Invalid Label type %d in %s", label, line))
29 | }
30 | } else {
31 | return nil, errors.New(fmt.Sprintf("Invalid line: %s", line))
32 | }
33 | }
34 |
35 | func ReadExamples(filename string) ([]*model.Example, error) {
36 | fp, err := os.Open(filename)
37 | defer fp.Close()
38 | if err != nil {
39 | return nil, err
40 | }
41 |
42 | scanner := bufio.NewScanner(fp)
43 | var examples model.Examples
44 | for scanner.Scan() {
45 | line := scanner.Text()
46 | e, err := ParseLine(line)
47 | if err != nil {
48 | return nil, err
49 | }
50 | examples = append(examples, e)
51 | }
52 | if err := scanner.Err(); err != nil {
53 | return nil, err
54 | }
55 | return examples, nil
56 | }
57 |
58 | func WriteExamples(examples model.Examples, filename string) error {
59 | fp, err := os.Create(filename)
60 | defer fp.Close()
61 | if err != nil {
62 | return err
63 | }
64 |
65 | writer := bufio.NewWriter(fp)
66 | for _, e := range examples {
67 | if e.IsNew && e.IsLabeled() {
68 | url := e.FinalUrl
69 | if url == "" {
70 | url = e.Url
71 | }
72 | _, err := writer.WriteString(url + "\t" + strconv.Itoa(int(e.Label)) + "\n")
73 | if err != nil {
74 | return err
75 | }
76 | }
77 | }
78 |
79 | writer.Flush()
80 | return nil
81 | }
82 |
--------------------------------------------------------------------------------
/lib/util/file/file_test.go:
--------------------------------------------------------------------------------
1 | package file
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 |
7 | "github.com/syou6162/go-active-learning/lib/example"
8 | "github.com/syou6162/go-active-learning/lib/model"
9 | )
10 |
11 | func TestParseLine(t *testing.T) {
12 | line1 := "http://model.com\t1"
13 | e, err := ParseLine(line1)
14 |
15 | if err != nil {
16 | t.Error("cannot parse line1")
17 | }
18 | if e.Label != model.POSITIVE {
19 | t.Error("Label must be POSITIVE")
20 | }
21 |
22 | line2 := "http://model.com\t-1"
23 | e, err = ParseLine(line2)
24 |
25 | if err != nil {
26 | t.Error("cannot parse line2")
27 | }
28 | if e.Label != model.NEGATIVE {
29 | t.Error("Label must be NEGATIVE")
30 | }
31 |
32 | line3 := "http://model.com"
33 | e, err = ParseLine(line3)
34 |
35 | if err != nil {
36 | t.Error("cannot parse line3")
37 | }
38 | if e.Label != model.UNLABELED {
39 | t.Error("Label must be UNLABELED")
40 | }
41 |
42 | line4 := "http://model.com\t2"
43 | e, err = ParseLine(line4)
44 |
45 | if e != nil {
46 | t.Error("wrong line format")
47 | }
48 | }
49 |
50 | func TestReadExamples(t *testing.T) {
51 | filename := "../../../tech_input_example.txt"
52 | examples, err := ReadExamples(filename)
53 |
54 | if err != nil {
55 | fmt.Println(err.Error())
56 | t.Error(fmt.Printf("Cannot read examples from %s\n", filename))
57 | }
58 | if len(examples) == 0 {
59 | t.Error(fmt.Printf("%s should contain more than one examples\n", filename))
60 | }
61 | }
62 |
63 | func TestWriteExamples(t *testing.T) {
64 | filename := ".write_test.txt"
65 | e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
66 | e2 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
67 |
68 | err := WriteExamples(model.Examples{e1, e2}, filename)
69 | if err != nil {
70 | t.Error(fmt.Printf("Cannot write examples to %s", filename))
71 | }
72 |
73 | examples, err := ReadExamples(filename)
74 | if err != nil {
75 | t.Error(fmt.Printf("Cannot read examples from %s", filename))
76 | }
77 | if len(examples) == 2 {
78 | t.Error(fmt.Printf("%s should contain two examples", filename))
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/lib/util/util.go:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | import (
4 | "os"
5 |
6 | "github.com/syou6162/go-active-learning/lib/model"
7 | )
8 |
9 | func FilterLabeledExamples(examples model.Examples) model.Examples {
10 | var result model.Examples
11 | for _, e := range examples {
12 | if e.IsLabeled() {
13 | result = append(result, e)
14 | }
15 | }
16 | return result
17 | }
18 |
19 | func FilterUnlabeledExamples(examples model.Examples) model.Examples {
20 | result := model.Examples{}
21 |
22 | alreadyLabeledByURL := make(map[string]bool)
23 | alreadyLabeledByTitle := make(map[string]bool)
24 | for _, e := range FilterLabeledExamples(examples) {
25 | alreadyLabeledByURL[e.Url] = true
26 | alreadyLabeledByURL[e.FinalUrl] = true
27 | alreadyLabeledByTitle[e.Title] = true
28 | }
29 |
30 | for _, e := range examples {
31 | if _, ok := alreadyLabeledByURL[e.Url]; ok {
32 | continue
33 | }
34 | if _, ok := alreadyLabeledByTitle[e.Title]; ok {
35 | continue
36 | }
37 | if !e.IsLabeled() {
38 | alreadyLabeledByURL[e.Url] = true
39 | alreadyLabeledByURL[e.FinalUrl] = true
40 | alreadyLabeledByTitle[e.Title] = true
41 | result = append(result, e)
42 | }
43 | }
44 | return result
45 | }
46 |
// RemoveDuplicate returns the distinct strings of args, keeping the first
// occurrence of each and preserving input order. The result is never nil.
func RemoveDuplicate(args []string) []string {
	seen := make(map[string]bool)
	unique := []string{}
	for _, s := range args {
		if seen[s] {
			continue
		}
		seen[s] = true
		unique = append(unique, s)
	}
	return unique
}
58 |
59 | func FilterStatusCodeOkExamples(examples model.Examples) model.Examples {
60 | result := model.Examples{}
61 |
62 | for _, e := range examples {
63 | if e.StatusCode == 200 {
64 | result = append(result, e)
65 | }
66 | }
67 |
68 | return result
69 | }
70 |
71 | func FilterStatusCodeNotOkExamples(examples model.Examples) model.Examples {
72 | result := model.Examples{}
73 |
74 | for _, e := range examples {
75 | if e.StatusCode != 200 {
76 | result = append(result, e)
77 | }
78 | }
79 |
80 | return result
81 | }
82 |
83 | func RemoveExample(examples model.Examples, toBeRemoved model.Example) model.Examples {
84 | result := model.Examples{}
85 |
86 | for _, e := range examples {
87 | if e.Url != toBeRemoved.Url {
88 | result = append(result, e)
89 | }
90 | }
91 |
92 | return result
93 | }
94 |
95 | func RemoveNegativeExamples(examples model.Examples) model.Examples {
96 | result := model.Examples{}
97 | for _, e := range examples {
98 | if e.Label != model.NEGATIVE {
99 | result = append(result, e)
100 | }
101 | }
102 | return result
103 | }
104 |
105 | func UniqueByFinalUrl(examples model.Examples) model.Examples {
106 | result := model.Examples{}
107 | m := make(map[string]bool)
108 | for _, e := range examples {
109 | if !m[e.FinalUrl] {
110 | m[e.FinalUrl] = true
111 | result = append(result, e)
112 | }
113 | }
114 | return result
115 | }
116 |
117 | func UniqueByTitle(examples model.Examples) model.Examples {
118 | result := model.Examples{}
119 | m := make(map[string]bool)
120 | for _, e := range examples {
121 | if !m[e.Title] {
122 | m[e.Title] = true
123 | result = append(result, e)
124 | }
125 | }
126 | return result
127 | }
128 |
129 | func GetEnv(key, fallback string) string {
130 | value, ok := os.LookupEnv(key)
131 | if !ok {
132 | value = fallback
133 | }
134 | return value
135 | }
136 |
--------------------------------------------------------------------------------
/lib/util/util_test.go:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/syou6162/go-active-learning/lib/example"
7 | "github.com/syou6162/go-active-learning/lib/model"
8 | )
9 |
10 | func TestFilterLabeledExamples(t *testing.T) {
11 | e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
12 | e2 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
13 | e3 := example.NewExample("http://google.com", model.UNLABELED)
14 |
15 | examples := FilterLabeledExamples(model.Examples{e1, e2, e3})
16 | if len(examples) != 2 {
17 | t.Error("Number of labeled examples should be 2")
18 | }
19 | }
20 |
21 | func TestFilterUnlabeledExamples(t *testing.T) {
22 | e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
23 | e2 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
24 | e3 := example.NewExample("http://google.com", model.UNLABELED)
25 | e3.Title = "Google"
26 |
27 | examples := FilterUnlabeledExamples(model.Examples{e1, e2, e3})
28 | if len(examples) != 1 {
29 | t.Error("Number of unlabeled examples should be 1")
30 | }
31 | }
32 |
33 | func TestFilterStatusCodeOkExamples(t *testing.T) {
34 | e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
35 | e1.StatusCode = 200
36 | e2 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
37 | e2.StatusCode = 404
38 | e3 := example.NewExample("http://google.com", model.UNLABELED)
39 | e3.StatusCode = 304
40 |
41 | examples := FilterStatusCodeOkExamples(model.Examples{e1, e2, e3})
42 | if len(examples) != 1 {
43 | t.Error("Number of examples (status code = 200) should be 1")
44 | }
45 | }
46 |
47 | func TestUniqueByFinalUrl(t *testing.T) {
48 | e1 := model.Example{FinalUrl: "aaa"}
49 | e2 := model.Example{FinalUrl: "bbb"}
50 | e3 := model.Example{FinalUrl: "aaa"}
51 | examples := model.Examples{&e1, &e2, &e3}
52 | result := UniqueByFinalUrl(examples)
53 | if len(result) != 2 {
54 | t.Errorf("length(result) should be %d, but %d", 2, len(result))
55 | }
56 | }
57 |
58 | func TestRemoveDuplicate(t *testing.T) {
59 | args := []string{"hoge", "fuga", "piyo", "hoge"}
60 |
61 | result := RemoveDuplicate(args)
62 | if len(result) != 3 {
63 | t.Error("Number of unique string in args should be 3")
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "os"
6 |
7 | "github.com/syou6162/go-active-learning/lib/command"
8 | "github.com/urfave/cli"
9 | )
10 |
11 | func main() {
12 | app := cli.NewApp()
13 | app.Name = "go-active-learning"
14 | app.Commands = command.Commands
15 |
16 | if err := app.Run(os.Args); err != nil {
17 | fmt.Fprintln(os.Stderr, err)
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/migrations/0.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Creates the example table (id, url, label and timestamps), a unique index
-- on url and an index on (label, updated_at DESC).
CREATE TABLE IF NOT EXISTS example (
  "id" SERIAL NOT NULL PRIMARY KEY,
  "url" TEXT NOT NULL,
  "label" INT NOT NULL,
  "created_at" timestamp NOT NULL,
  "updated_at" timestamp NOT NULL
);

CREATE UNIQUE INDEX IF NOT EXISTS "url_idx_example" ON example ("url");
CREATE INDEX IF NOT EXISTS "label_updated_at_idx_example" ON example ("label", "updated_at" DESC);

-- +migrate Down
-- Drops both indexes, then the table.
DROP INDEX "url_idx_example";
DROP INDEX "label_updated_at_idx_example";

DROP TABLE example;
18 |
--------------------------------------------------------------------------------
/migrations/1.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Adds final_url: created with a temporary '' default so existing rows satisfy
-- NOT NULL, backfilled from url, then the default is removed.
ALTER TABLE "example" ADD COLUMN "final_url" TEXT DEFAULT '' NOT NULL;
UPDATE "example" SET "final_url" = "url";
ALTER TABLE "example" ALTER COLUMN "final_url" DROP DEFAULT;

-- Adds nullable text columns plus score, is_new and status_code with defaults.
ALTER TABLE "example" ADD COLUMN "title" TEXT;
ALTER TABLE "example" ADD COLUMN "description" TEXT;
ALTER TABLE "example" ADD COLUMN "og_description" TEXT;
ALTER TABLE "example" ADD COLUMN "og_type" TEXT;
ALTER TABLE "example" ADD COLUMN "og_image" TEXT;
ALTER TABLE "example" ADD COLUMN "body" TEXT;
ALTER TABLE "example" ADD COLUMN "score" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;
ALTER TABLE "example" ADD COLUMN "is_new" BOOLEAN DEFAULT FALSE NOT NULL;
ALTER TABLE "example" ADD COLUMN "status_code" INT DEFAULT 0 NOT NULL;
ALTER TABLE "example" ADD COLUMN "favicon" TEXT;

-- Unique index on final_url (dropped again by migration 4).
CREATE UNIQUE INDEX IF NOT EXISTS "final_url_idx_example" ON example ("final_url");

-- +migrate Down
-- Drops the index and every column added above.
DROP INDEX "final_url_idx_example";

ALTER TABLE "example" DROP COLUMN "final_url";
ALTER TABLE "example" DROP COLUMN "title";
ALTER TABLE "example" DROP COLUMN "description";
ALTER TABLE "example" DROP COLUMN "og_description";
ALTER TABLE "example" DROP COLUMN "og_type";
ALTER TABLE "example" DROP COLUMN "og_image";
ALTER TABLE "example" DROP COLUMN "body";
ALTER TABLE "example" DROP COLUMN "score";
ALTER TABLE "example" DROP COLUMN "is_new";
ALTER TABLE "example" DROP COLUMN "status_code";
ALTER TABLE "example" DROP COLUMN "favicon";
33 |
--------------------------------------------------------------------------------
/migrations/10.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Creates the recommendation table mapping a list_type to example ids. Rows
-- are removed automatically when the referenced example is deleted.
-- NOTE(review): example_id is declared SERIAL although it is a foreign key;
-- a plain INT would presumably suffice — left unchanged in this migration.
CREATE TABLE IF NOT EXISTS recommendation (
  "list_type" INT NOT NULL,
  "example_id" SERIAL NOT NULL,
  CONSTRAINT recommendation_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS "list_type_idx_recommendation" ON recommendation ("list_type");

-- +migrate Down
-- Drops the index, then the table.
DROP INDEX "list_type_idx_recommendation";

DROP TABLE recommendation;
14 |
--------------------------------------------------------------------------------
/migrations/11.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Adds an error_count column to example; existing rows start at 0.
ALTER TABLE "example" ADD COLUMN "error_count" INT NOT NULL DEFAULT 0;

-- +migrate Down
-- Removes the error_count column.
ALTER TABLE "example" DROP COLUMN "error_count";
6 |
--------------------------------------------------------------------------------
/migrations/12.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Adds a label column to tweet; existing rows get label 0.
ALTER TABLE "tweet" ADD COLUMN "label" INT NOT NULL DEFAULT 0;

-- +migrate Down
-- Removes the label column.
ALTER TABLE "tweet" DROP COLUMN "label";
6 |
--------------------------------------------------------------------------------
/migrations/13.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Adds a model_type discriminator, the parameter c and evaluation metrics
-- (accuracy, precision, recall, fvalue) to model; all default to 0.
ALTER TABLE "model" ADD COLUMN "model_type" INT NOT NULL DEFAULT 0;
ALTER TABLE "model" ADD COLUMN "c" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;
ALTER TABLE "model" ADD COLUMN "accuracy" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;
ALTER TABLE "model" ADD COLUMN "precision" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;
ALTER TABLE "model" ADD COLUMN "recall" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;
ALTER TABLE "model" ADD COLUMN "fvalue" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;

-- Replaces the created_at index from migration 9 with one on
-- (model_type, created_at).
DROP INDEX "created_at_model";
CREATE INDEX IF NOT EXISTS "model_type_created_at_model" ON model ("model_type", "created_at");

-- +migrate Down
-- Drops the composite index and the added columns, then restores the
-- created_at index.
DROP INDEX "model_type_created_at_model";

ALTER TABLE "model" DROP COLUMN "model_type";
ALTER TABLE "model" DROP COLUMN "c";
ALTER TABLE "model" DROP COLUMN "accuracy";
ALTER TABLE "model" DROP COLUMN "precision";
ALTER TABLE "model" DROP COLUMN "recall";
ALTER TABLE "model" DROP COLUMN "fvalue";

CREATE INDEX IF NOT EXISTS "created_at_model" ON model ("created_at");
23 |
--------------------------------------------------------------------------------
/migrations/14.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Adds a score column to tweet; existing rows get 0.0.
ALTER TABLE "tweet" ADD COLUMN "score" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;

-- +migrate Down
-- Removes the score column.
ALTER TABLE "tweet" DROP COLUMN "score";
6 |
--------------------------------------------------------------------------------
/migrations/15.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Creates related_example, linking an example to another example. Both ids
-- reference example(id) with ON DELETE CASCADE, and the CHECK forbids an
-- example from being related to itself.
-- NOTE(review): both id columns are declared SERIAL although they are foreign
-- keys; plain INT would presumably suffice — left unchanged in this migration.
CREATE TABLE IF NOT EXISTS related_example (
  "example_id" SERIAL NOT NULL,
  "related_example_id" SERIAL NOT NULL,
  CONSTRAINT related_example_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE,
  CONSTRAINT related_example_related_example_id_fkey FOREIGN KEY ("related_example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE,
  CHECK(example_id != related_example_id)
);

CREATE INDEX IF NOT EXISTS "example_id_idx_related_example" ON related_example ("example_id");

-- +migrate Down
-- Drops the index, then the table.
DROP INDEX "example_id_idx_related_example";

DROP TABLE related_example;
16 |
--------------------------------------------------------------------------------
/migrations/16.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Creates top_accessed_example, a list of example ids. The unique index
-- allows each example at most once, and rows are removed when the referenced
-- example is deleted.
-- NOTE(review): example_id is declared SERIAL although it is a foreign key;
-- a plain INT would presumably suffice — left unchanged in this migration.
CREATE TABLE IF NOT EXISTS top_accessed_example (
  "example_id" SERIAL NOT NULL,
  CONSTRAINT top_accessed_example_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE
);

CREATE UNIQUE INDEX IF NOT EXISTS "example_id_idx_top_accessed_example" ON top_accessed_example ("example_id");

-- +migrate Down
-- Drops the index, then the table.
DROP INDEX "example_id_idx_top_accessed_example";

DROP TABLE top_accessed_example;
13 |
--------------------------------------------------------------------------------
/migrations/2.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Creates the feature table holding text features per example; rows are
-- removed when the referenced example is deleted.
-- NOTE(review): the index below is named "example_id_idx_example" although it
-- is defined on the feature table — the name is kept because later tooling
-- may refer to it.
CREATE TABLE IF NOT EXISTS feature (
  "example_id" SERIAL NOT NULL,
  "feature" TEXT NOT NULL,
  CONSTRAINT feature_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS "example_id_idx_example" ON feature ("example_id");

-- +migrate Down
-- Drops the index, then the table.
DROP INDEX "example_id_idx_example";
DROP TABLE feature;
13 |
--------------------------------------------------------------------------------
/migrations/3.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Gives the text columns added in migration 1 an empty-string default.
ALTER TABLE "example" ALTER COLUMN "title" SET DEFAULT '';
ALTER TABLE "example" ALTER COLUMN "description" SET DEFAULT '';
ALTER TABLE "example" ALTER COLUMN "og_description" SET DEFAULT '';
ALTER TABLE "example" ALTER COLUMN "og_type" SET DEFAULT '';
ALTER TABLE "example" ALTER COLUMN "og_image" SET DEFAULT '';
ALTER TABLE "example" ALTER COLUMN "body" SET DEFAULT '';
ALTER TABLE "example" ALTER COLUMN "favicon" SET DEFAULT '';

-- +migrate Down
-- Removes those defaults again.
ALTER TABLE "example" ALTER COLUMN "title" DROP DEFAULT;
ALTER TABLE "example" ALTER COLUMN "description" DROP DEFAULT;
ALTER TABLE "example" ALTER COLUMN "og_description" DROP DEFAULT;
ALTER TABLE "example" ALTER COLUMN "og_type" DROP DEFAULT;
ALTER TABLE "example" ALTER COLUMN "og_image" DROP DEFAULT;
ALTER TABLE "example" ALTER COLUMN "body" DROP DEFAULT;
ALTER TABLE "example" ALTER COLUMN "favicon" DROP DEFAULT;
18 |
--------------------------------------------------------------------------------
/migrations/4.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Drops the unique index on example.final_url created in migration 1, so
-- several examples may share the same final_url afterwards.
DROP INDEX "final_url_idx_example";

-- +migrate Down
-- Recreates the unique index; this fails if duplicate final_url values exist.
CREATE UNIQUE INDEX IF NOT EXISTS "final_url_idx_example" ON example ("final_url");
6 |
--------------------------------------------------------------------------------
/migrations/5.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- hatena_bookmark: one row per example (unique on example_id and on url),
-- removed when the referenced example is deleted.
CREATE TABLE IF NOT EXISTS hatena_bookmark (
  "id" SERIAL NOT NULL PRIMARY KEY,
  "example_id" SERIAL NOT NULL,
  "title" TEXT NOT NULL,
  "screenshot" TEXT NOT NULL,
  "entry_url" TEXT NOT NULL,
  "count" INT NOT NULL,
  "url" TEXT NOT NULL,
  "eid" TEXT NOT NULL,
  CONSTRAINT hatena_bookmark_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE
);

CREATE UNIQUE INDEX IF NOT EXISTS "example_id_idx_hatena_bookmark" ON hatena_bookmark ("example_id");
CREATE UNIQUE INDEX IF NOT EXISTS "url_idx_hatena_bookmark" ON hatena_bookmark ("url");

-- bookmark: individual user bookmarks belonging to a hatena_bookmark row, at
-- most one per (hatena_bookmark_id, user); removed together with the parent.
CREATE TABLE IF NOT EXISTS bookmark (
  "hatena_bookmark_id" SERIAL NOT NULL,
  "user" TEXT NOT NULL,
  "comment" TEXT NOT NULL,
  "tags" TEXT NOT NULL,
  "timestamp" timestamp NOT NULL,
  CONSTRAINT bookmark_hatena_bookmark_id_fkey FOREIGN KEY ("hatena_bookmark_id") REFERENCES hatena_bookmark("id") ON UPDATE NO ACTION ON DELETE CASCADE
);

CREATE UNIQUE INDEX IF NOT EXISTS "hatena_bookmark_id_user_idx_bookmark" ON bookmark ("hatena_bookmark_id", "user");

-- +migrate Down
-- Drops the indexes, then bookmark before hatena_bookmark because bookmark
-- references it.
DROP INDEX "hatena_bookmark_id_user_idx_bookmark";
DROP INDEX "example_id_idx_hatena_bookmark";
DROP INDEX "url_idx_hatena_bookmark";

DROP TABLE bookmark;
DROP TABLE hatena_bookmark;
35 |
--------------------------------------------------------------------------------
/migrations/6.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Creates the tweet table: tweets attached to an example, at most one row per
-- (example_id, id_str); rows are removed when the referenced example is
-- deleted.
CREATE TABLE IF NOT EXISTS tweet (
  "id" SERIAL NOT NULL PRIMARY KEY,
  "example_id" SERIAL NOT NULL,

  "created_at" timestamp NOT NULL,
  "id_str" TEXT NOT NULL,
  "full_text" TEXT NOT NULL,
  "favorite_count" INT NOT NULL,
  "retweet_count" INT NOT NULL,
  "lang" TEXT NOT NULL,

  "screen_name" TEXT NOT NULL,
  "name" TEXT NOT NULL,
  "profile_image_url" TEXT NOT NULL,

  CONSTRAINT tweet_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS "example_id_idx_tweet" ON tweet ("example_id");
CREATE UNIQUE INDEX IF NOT EXISTS "example_id_id_str_idx_tweet" ON tweet ("example_id", "id_str");

-- +migrate Down
-- Drops both indexes, then the table.
DROP INDEX "example_id_id_str_idx_tweet";
DROP INDEX "example_id_idx_tweet";
DROP TABLE tweet;
27 |
--------------------------------------------------------------------------------
/migrations/7.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Adds a retweeted flag to tweet, false for existing rows (removed again by
-- migration 8).
ALTER TABLE "tweet" ADD COLUMN "retweeted" BOOLEAN NOT NULL DEFAULT false;

-- +migrate Down
-- Removes the retweeted flag.
ALTER TABLE "tweet" DROP COLUMN "retweeted";
6 |
--------------------------------------------------------------------------------
/migrations/8.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Removes the retweeted flag that migration 7 added to tweet.
ALTER TABLE "tweet" DROP COLUMN "retweeted";

-- +migrate Down
-- Restores the retweeted flag with its original definition.
ALTER TABLE "tweet" ADD COLUMN "retweeted" BOOLEAN NOT NULL DEFAULT false;
6 |
--------------------------------------------------------------------------------
/migrations/9.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Creates the model table: a text-serialized model plus its creation time,
-- which defaults to the insertion timestamp, and an index on created_at.
CREATE TABLE IF NOT EXISTS model (
  "model" TEXT NOT NULL,
  "created_at" timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS "created_at_model" ON model ("created_at");

-- +migrate Down
-- Drops the index, then the table.
DROP INDEX "created_at_model";

DROP TABLE model;
12 | DROP TABLE model;
13 |
--------------------------------------------------------------------------------
/script/create_database.sql:
--------------------------------------------------------------------------------
-- Creates the application database and a separate database for tests.
CREATE DATABASE "go-active-learning";
CREATE DATABASE "go-active-learning-test";

-- Login role used by the application.
-- NOTE(review): the password is hard-coded; presumably intended for local
-- development only — confirm before using outside of it.
CREATE ROLE "nobody" WITH PASSWORD 'nobody' LOGIN;

-- NOTE(review): these GRANTs affect objects that already exist in the public
-- schema of the database the session is connected to; presumably they need to
-- be run while connected to each database created above, after its tables
-- exist — confirm.
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO nobody;
GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO nobody;
8 |
--------------------------------------------------------------------------------
/tech_input_example.txt:
--------------------------------------------------------------------------------
1 | https://www.yasuhisay.info/ 1
2 | https://songmu.jp/riji/ 1
3 | https://blog.yuuk.io/ 1
4 | https://hakobe932.hatenablog.com/ 1
5 | https://motemen.hatenablog.com/ 1
6 | https://www3.nhk.or.jp/news/ -1
7 | https://www.facebook.com/ -1
8 | http://r.gnavi.co.jp/g-interview/ -1
9 | https://suumo.jp/town/ -1
10 | https://srdk.rakuten.jp/
11 | https://kuenishi.hatenadiary.jp/entry/2017/05/25/005527
12 | http://otiai10.hatenablog.com/entry/2017/05/24/163701
13 | https://www.yasuhisay.info/entry/2016/11/23/000000
14 | https://www.yasuhisay.info/entry/20090516/1242480413
15 | https://www.yasuhisay.info/entry/2017/05/18/080000
16 | https://arxiv.org/abs/1906.03776 1
17 | https://tech-blog.optim.co.jp/entry/2019/07/04/173000 1
18 | http://www.ai-gakkai.or.jp/my-bookmark_vol34-no4/ 1
19 | https://data.gunosy.io/entry/poincare_embedding_for_recommendations 1
20 | http://englishforhackers.com/ -1
21 | https://www.youtube.com/watch?v=5ZwknHMf1yo -1
22 | https://speakerdeck.com/livesense/shi-ye-heng-duan-zu-zhi-defalsemlsisutemukai-fa-yun-yong-toji-pan-she-ji 1
23 | https://www.yasuhisay.info/entry/splatoon2_udemae_x -1
24 | https://www.yasuhisay.info/entry/2018/11/13/090000 -1
25 | https://www.yasuhisay.info/entry/2016/09/26/080000 -1
26 | https://www.yasuhisay.info/entry/2016/03/27/215344 -1
27 | https://www.yasuhisay.info/entry/20110714/1310622171 -1
28 | https://anond.hatelabo.jp/20190713043218 -1
29 | https://cybozushiki.cybozu.co.jp/articles/m005412.html -1
30 | https://sorazine.soracom.jp/entry/2019/07/12/celestehair -1
31 | https://www.yasuhisay.info/entry/20090516/1242480413 1
32 | https://www.yasuhisay.info/entry/kaggle_avazu_ctr_prediction 1
33 | https://www.yasuhisay.info/entry/mlct_mackerel_anomaly_detection 1
34 | https://www.yasuhisay.info/entry/2018/10/04/201000 1
35 | https://honz.jp/articles/-/45278 -1
36 | https://www.megamouth.info/entry/2019/07/12/175250 -1
37 | https://www.lifehacker.jp/2019/07/193679_higedanshaku.html -1
38 | https://teineini.net/20190711-evernote-dokusyonote/ -1
39 | https://dev.classmethod.jp/tool/be-vimmer-by-trainings/ 1
40 | https://www.clear-code.com/blog/2019/7/12.html 1
41 | https://techlife.cookpad.com/entry/2019/07/13/055601 1
42 | https://future-architect.github.io/articles/20190713/ 1
43 | https://blog.craftz.dog/my-dev-workflow-using-tmux-vim-48f73cc4f39e 1
44 | https://junkyard.song.mu/slides/gocon2019-fukuoka/ 1
45 | https://nykergoto.hatenablog.jp/entry/2019/07/09/FFT_を使った時系列データ解析 1
46 | http://memorability.csail.mit.edu/index.html 1
47 | https://ynd.co/blog/tensorflow-vs-pytorch/ 1
48 | https://cloudplatform-jp.googleblog.com/2019/07/analyze-bigquery-data-with-kaggle-kernels-notebooks.html 1
49 | https://omedstu.jimdo.com/2019/07/05/force法によるrecurrent-spiking-neural-networksの教師あり学習/ 1
50 | https://buildersbox.corp-sansan.com/entry/2019/07/12/110000 1
51 | https://www.slideshare.net/shunsukekozawa5/gunosy-152302982 1
52 | https://ml-loft.connpass.com/event/136426/ 1
53 | https://nlp.stanford.edu/blog/a-new-multi-turn-multi-domain-task-oriented-dialogue-dataset/ 1
54 | https://blog.ml.cmu.edu/2019/08/02/regret-circuits-composability-of-regret-minimizers/ 1
55 | https://tech-blog.abeja.asia/entry/noisy-label-ml-survey 1
56 | https://imas.connpass.com/event/140272/ -1
57 | http://alissak.hatenablog.com/entry/2019/08/03/170413 -1
58 | https://ai.facebook.com/blog/advances-in-conversational-ai/ 1
59 | https://heartbeat.fritz.ai/a-2019-guide-to-semantic-segmentation-ca8242f5a7fc 1
60 | http://morningproject.hateblo.jp/entry/2019/08/03/112605 -1
61 | https://gendai.ismedia.jp/articles/-/66255 -1
62 | https://oreno-yuigon.hatenablog.com/entry/2019/08/02/143028 -1
63 | https://toyokeizai.net/articles/-/295714 -1
64 | https://www.jigowatt121.com/entry/2019/08/03/183756 -1
65 | https://shogi.zukeran.org/2019/07/31/konsen-1/ -1
66 | https://note.mu/futashika/n/n382a4780b8bd -1
67 | https://www.hotpepper.jp/mesitsu/entry/hiro-watanabe/19-00148 -1
68 | https://srdk.rakuten.jp/entry/2019/08/02/103000 -1
69 | https://blog.hatenablog.com/entry/2019/07/26/180000 -1
70 | https://www.huffingtonpost.jp/entry/oomura-conference_jp_5d454be5e4b0aca3411e2fe0 -1
71 | https://togetter.com/li/1383361 -1
72 | https://biz-journal.jp/2019/08/post_112573.html -1
73 | https://www.mofumofu.pink/entry/2019/08/03/144340 -1
74 | https://nlab.itmedia.co.jp/nl/articles/1908/03/news013.html -1
75 | https://behappy.pink/fedelini-alla-puttanesca/ -1
76 | https://helloandgoodbyecraft.com/jokes -1
77 | https://toyokeizai.net/articles/-/295293 -1
78 | https://sirabee.com/2019/07/31/20162134165/ -1
79 | https://www.around50-konkatsu.info/entry/2019/08/03/南の島へ現実逃避 -1
80 | https://www7.ikutanpapa.com/entry/taketei -1
81 | https://datarobot.connpass.com/event/209149/ 1
82 | https://www.mediatechnology.jp/entry/2021/03/31/160000 1
83 | https://openreview.net/forum?id=IrM64DGB21 1
84 | https://github.com/intel-isl/DPT 1
85 | https://blog.amedama.jp/entry/lgbm-data-size-vs-best-iters 1
86 | https://recruit.gmo.jp/engineer/jisedai/blog/vision_transformer/ 1
87 | https://data.gunosy.io/entry/deim2021 1
88 | https://logmi.jp/tech/articles/324141 1
89 | https://qiita.com/jovyan/items/c41ab61a6b04e9a6e4df 1
90 | https://github.com/manujosephv/pytorch_tabular 1
91 | https://tech.retrieva.jp/entry/2021/04/01/114943 1
92 | https://memo.sugyan.com/entry/2021/04/02/005434 1
93 | https://rooftop.cc/news/2021/03/31160000.php -1
94 | https://www.tokio.inc/s/tokio/ -1
95 | https://firego8.com/fire%e3%81%97%e3%81%be%e3%81%97%e3%81%9f%ef%bc%81 -1
96 | https://comic-days.com/episode/3269632237302594670 -1
97 | https://www.youtube.com/watch?v=oGEUZuicEYM -1
98 | https://www.youtube.com/watch?v=Cs_l0LIhg5M -1
99 | https://nlab.itmedia.co.jp/nl/articles/2104/01/news105.html -1
100 | https://animeanime.jp/article/2021/04/02/60531.html -1
101 | https://ja.kohsuke.org/%E3%82%BD%E3%83%95%E3%83%88%E3%82%A6%E3%82%A7%E3%82%A2%E9%96%8B%E7%99%BA/%E5%84%AA%E7%A7%80%E3%81%95%E3%81%AB%E3%81%A4%E3%81%84%E3%81%A6/ -1
102 | https://togetter.com/li/1691643 -1
103 | https://www.yasuhisay.info/entry/2021/02/12/090000 1
104 | https://www.yasuhisay.info/entry/2021/02/25/130000 1
105 | https://www.yasuhisay.info/entry/2021/03/12/114500 1
106 | https://www.yasuhisay.info/entry/2021/03/24/090000 1
107 | https://www.yasuhisay.info/entry/2021/03/25/083000 1
108 | https://www.yasuhisay.info/entry/2021/03/28/143000 1
109 | https://wapa5pow.com/posts/2021-03-31--day-one-in-project 1
110 | https://aws.amazon.com/jp/blogs/startup/tech-case-study-jp-startup-ai-ml/ 1
111 | https://nhiroki.jp/2021/03/31/design-docs 1
112 | https://dev.classmethod.jp/articles/ways-to-check-fargate-cpu-usage/ 1
113 | https://dev.classmethod.jp/articles/amazon-route-53-resolver-dns-firewall/ 1
114 | https://scrapbox.io/mizdra/chrome_devtools_%E3%81%AE_tips_N%E9%80%A3%E7%99%BA 1
115 | https://zenn.dev/saboyutaka/articles/07f1351a6b0049 1
116 |
--------------------------------------------------------------------------------