├── .github
    ├── dependabot.yml
    └── workflows
    │   └── build.yaml
├── LICENSE
├── Makefile
├── README.md
├── dbconfig.yml
├── go.mod
├── go.sum
├── lib
    ├── add
    │   ├── add.go
    │   └── add_test.go
    ├── annotation
    │   ├── annotation.go
    │   ├── annotation_cli.go
    │   └── annotation_slack.go
    ├── classifier
    │   ├── mira.go
    │   └── mira_test.go
    ├── command
    │   └── command.go
    ├── diagnosis
    │   ├── diagnosis.go
    │   ├── feature_weight
    │   │   ├── feature_weight.go
    │   │   └── feature_weight_test.go
    │   └── label_conflict
    │   │   ├── label_conflict.go
    │   │   └── label_conflict_test.go
    ├── evaluation
    │   ├── evaluation.go
    │   └── evaluation_test.go
    ├── example
    │   └── example.go
    ├── feature
    │   ├── example
    │   │   ├── example.go
    │   │   └── example_test.go
    │   ├── feature.go
    │   └── tweet
    │   │   ├── tweet.go
    │   │   └── tweet_test.go
    ├── fetcher
    │   ├── fetcher.go
    │   └── fetcher_test.go
    ├── hatena_bookmark
    │   ├── hatena_bookmark.go
    │   └── hatena_bookmark_test.go
    ├── model
    │   ├── error.go
    │   ├── example.go
    │   ├── hatena_bookmark.go
    │   ├── label_type.go
    │   ├── recommendation.go
    │   ├── related_example.go
    │   └── tweet.go
    ├── related_example
    │   └── related_example.go
    ├── repository
    │   ├── example.go
    │   ├── example_test.go
    │   ├── hatena_bookmark.go
    │   ├── hatena_bookmark_test.go
    │   ├── mira.go
    │   ├── mira_test.go
    │   ├── recommendation.go
    │   ├── recommendation_test.go
    │   ├── related_example.go
    │   ├── related_example_test.go
    │   ├── repository.go
    │   ├── top_accessed_example.go
    │   ├── top_accessed_example_test.go
    │   ├── tweet.go
    │   └── tweet_test.go
    ├── service
    │   ├── example.go
    │   ├── example_test.go
    │   └── service.go
    ├── top_accessed_example
    │   └── top_accessed_example.go
    └── util
    │   ├── converter
    │       └── converter.go
    │   ├── file
    │       ├── file.go
    │       └── file_test.go
    │   ├── util.go
    │   └── util_test.go
├── main.go
├── migrations
    ├── 0.sql
    ├── 1.sql
    ├── 10.sql
    ├── 11.sql
    ├── 12.sql
    ├── 13.sql
    ├── 14.sql
    ├── 15.sql
    ├── 16.sql
    ├── 2.sql
    ├── 3.sql
    ├── 4.sql
    ├── 5.sql
    ├── 6.sql
    ├── 7.sql
    ├── 8.sql
    └── 9.sql
├── script
    └── create_database.sql
└── tech_input_example.txt


/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 | - package-ecosystem: gomod
 4 |   directory: "/"
 5 |   schedule:
 6 |     interval: daily
 7 |     time: "20:00"
 8 |   open-pull-requests-limit: 10
 9 |   reviewers:
10 |   - syou6162
11 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yaml:
--------------------------------------------------------------------------------
 1 | name: build and test
 2 | on: [push]
 3 | 
 4 | jobs:
 5 |   build:
 6 |     name: build and test
 7 |     runs-on: ubuntu-latest
 8 |     services:
 9 |       postgres:
10 |         image: postgres:9.6
11 |         env:
12 |           POSTGRES_USER: nobody 
13 |           POSTGRES_PASSWORD: nobody 
14 |           POSTGRES_DB: go-active-learning-test
15 |         ports:
16 |           - 5432:5432
17 |         options: >-
18 |           --health-cmd pg_isready
19 |           --health-interval 10s
20 |           --health-timeout 5s
21 |           --health-retries 5
22 |           --name postgres
23 |     steps:
24 |       - name: checkout
25 |         uses: actions/checkout@v2
26 |       - name: format
27 |         run: test `gofmt -l $(git ls-files | grep -e '\.go$' | grep -v -e vendor) | wc -l` = 0
28 |       - name: deps
29 |         run: make deps
30 |       - name: build
31 |         run: make build
32 |       - name: test
33 |         run: |
34 |           export GOPATH=$HOME/go
35 |           export GOBIN=$(go env GOPATH)/bin
36 |           export PATH=$PATH:$GOPATH
37 |           export PATH=$PATH:$GOBIN
38 |           sql-migrate up -env=test
39 |           make cover
40 |           goveralls -coverprofile=${COVERAGE} -service=circle-ci -repotoken=${{ secrets.COVERALLS_TOKEN }}
41 |         env:
42 |           POSTGRES_HOST: localhost
43 |           POSTGRES_PORT: 5432        
44 |           POSTGRES_USER: nobody 
45 |           POSTGRES_PASSWORD: nobody
46 |           COVERAGE: coverage.out
47 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Yasuhisa Yoshida
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | COVERAGE = coverage.out
 2 | export GO111MODULE := on
 3 | 
 4 | all: build
 5 | 
 6 | .PHONY: deps
 7 | deps:
 8 | 	go mod download
 9 | 	go get github.com/mattn/goveralls
10 | 	go get github.com/haya14busa/goverage
11 | 	go get github.com/rubenv/sql-migrate/sql-migrate
12 | 
13 | .PHONY: build
14 | build:
15 | 	go build -v
16 | 
17 | .PHONY: fmt
18 | fmt:
19 | 	gofmt -s -w $$(git ls-files | grep -e '\.go$$' | grep -v -e vendor)
20 | 	goimports -w $$(git ls-files | grep -e '\.go$$' | grep -v -e vendor)
21 | 
22 | .PHONY: test
23 | test:
24 | 	DB_NAME=go-active-learning-test go test -v ./... -p 1 -count 1
25 | 
26 | .PHONY: vet
27 | vet: # `go tool vet` is unsupported since Go 1.12; `go vet` checks every package
28 | 	go vet ./...
29 | 
30 | .PHONY: test-all
31 | test-all: vet test
32 | 
33 | .PHONY: cover
34 | cover:
35 | 	DB_NAME=go-active-learning-test goverage -parallel 1 -v -coverprofile=${COVERAGE} ./...
36 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # go-active-learning
 2 | [![CircleCI](https://circleci.com/gh/syou6162/go-active-learning.svg?style=shield)](https://circleci.com/gh/syou6162/go-active-learning)
 3 | [![Go Report Card](https://goreportcard.com/badge/github.com/syou6162/go-active-learning)](https://goreportcard.com/report/github.com/syou6162/go-active-learning)
 4 | [![Coverage Status](https://coveralls.io/repos/github/syou6162/go-active-learning/badge.svg?branch=master)](https://coveralls.io/github/syou6162/go-active-learning?branch=master)
 5 | 
 6 | go-active-learning is a command-line annotation tool for binary classification problems, written in Go. It uses a simple active learning algorithm to reduce annotation time.
 7 | 
 8 | # Install
 9 | 
10 | ```console
11 | % go get github.com/syou6162/go-active-learning
12 | ```
13 | 
14 | ## Build from source
15 | 
16 | ```console
17 | % git clone https://github.com/syou6162/go-active-learning.git
18 | % cd go-active-learning
19 | % createdb go-active-learning
20 | % createdb go-active-learning-test
21 | % sql-migrate up -env=local
22 | % sql-migrate up -env=test
23 | % make build
24 | ```
25 | 
26 | # Usage
27 | go-active-learning has an `annotate` mode (annotate new examples suggested by active learning) and a `diagnose` mode (check label conflicts in training data). To see the detailed options, type `./go-active-learning --help`.
28 | 
29 | ## Annotation mode
30 | To see the detailed options, type `./go-active-learning annotate --help`.
31 | 
32 | ## Annotate new examples from command line interface
33 | To see the detailed options, type `./go-active-learning annotate cli --help`.
34 | 
35 | ```console
36 | % ./go-active-learning annotate cli --open-url
37 | Loading cache...
38 | Label this example (Score: 0.600): http://srdk.rakuten.jp/ (それどこ)
39 | 
40 | p: Label this example as positive.
41 | n: Label this example as negative.
42 | s: Skip this example.
43 | h: Show this help.
44 | e: Exit.
45 | 
46 | Label this example (Score: 1.000): http://srdk.rakuten.jp/ (それどこ)
47 | Labeled as negative
48 | ```
49 | 
50 | ## Annotate new examples from slack
51 | To see the detailed options, type `./go-active-learning annotate slack --help`. To annotate new examples from Slack, you need to create a Slack bot and obtain a token from [here](https://my.slack.com/services/new/bot). You can pass the token via an environment variable (`SLACK_TOKEN`).
52 | 
53 | ```console
54 | % export SLACK_TOKEN=xoxb-SLACK-TOKEN
55 | % ./go-active-learning annotate slack --filter-status-code-ok --channel CHANNEL_ID
56 | ```
57 | 
58 | ## Diagnosis mode
59 | To see the detailed options, type `./go-active-learning diagnose --help`.
60 | 
61 | ### Diagnose training data
62 | This subcommand diagnoses label conflicts in the training data. A 'conflict' means that the annotated label is -1 (or 1) but the label predicted by the model is 1 (or -1). In the example below, `http://www3.nhk.or.jp/news/` is a conflict case ('Label' is -1, but 'Score' is positive). You may need to collect more such news articles to train a good classifier.
63 | 
64 | ```console
65 | % ./go-active-learning diagnose label-conflict
66 | Loading cache...
67 | Index   Label   Score   URL     Title
68 | 0       -1      0.491   http://www3.nhk.or.jp/news/
69 | 1       1       0.491   http://blog.yuuk.io/
70 | 2       1       0.491   http://www.yasuhisay.info/
71 | 3       -1      -3.057  http://r.gnavi.co.jp/g-interview/       ぐるなび みんなのごはん
72 | 4       1       4.264   http://hakobe932.hatenablog.com/        hakobe-blog ♨
73 | 5       -1      -7.151  http://suumo.jp/town/   SUUMOタウン
74 | 6       -1      -26.321 https://www.facebook.com/       ログイン (日本語)
75 | 7       1       44.642  http://www.songmu.jp/riji/      おそらくはそれさえも平凡な日々
76 | 8       1       121.170 http://motemen.hatenablog.com/  詩と創作・思索のひろば
77 | Saving cache...
78 | ```
79 | 
80 | ### Diagnose feature weight
81 | This subcommand lists feature weights together with their feature names.
82 | 
83 | ```console
84 | % ./go-active-learning diagnose feature-weight --filter-status-code-ok | head -n 10
85 | +0.80   BODY:/
86 | +0.80   BODY:ほか
87 | +0.80   BODY:郁
88 | +0.80   BODY:単行本
89 | +0.80   BODY:姿
90 | +0.80   BODY:暗黙
91 | +0.80   BODY:創造
92 | +0.80   BODY:企業
93 | +0.80   BODY:野中
94 | +0.80   BODY:準備
95 | ```
96 | 
97 | # Author
98 | Yasuhisa Yoshida
99 | 


--------------------------------------------------------------------------------
/dbconfig.yml:
--------------------------------------------------------------------------------
 1 | test:
 2 |   dialect: postgres
 3 |   datasource: host=localhost user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} dbname=go-active-learning-test sslmode=disable
 4 |   dir: migrations
 5 | 
 6 | local:
 7 |   dialect: postgres
 8 |   datasource: host=localhost user=nobody password=nobody dbname=go-active-learning sslmode=disable
 9 |   dir: migrations
10 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/syou6162/go-active-learning
 2 | 
 3 | go 1.12
 4 | 
 5 | require (
 6 | 	github.com/PuerkitoBio/goquery v1.5.1
 7 | 	github.com/araddon/dateparse v0.0.0-20190622164848-0fb0a474d195 // indirect
 8 | 	github.com/fatih/set v0.2.1 // indirect
 9 | 	github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 // indirect
10 | 	github.com/gorilla/websocket v1.4.0 // indirect
11 | 	github.com/ikawaha/kagome v1.11.2
12 | 	github.com/jaytaylor/html2text v0.0.0-20190408195923-01ec452cbe43 // indirect
13 | 	github.com/jdkato/prose v0.0.0-20181022194337-a179b97cfa6f
14 | 	github.com/jmoiron/sqlx v1.3.1
15 | 	github.com/lib/pq v1.10.0
16 | 	github.com/mackerelio/mackerel-client-go v0.16.0
17 | 	github.com/mattn/go-isatty v0.0.8 // indirect
18 | 	github.com/mattn/go-runewidth v0.0.4 // indirect
19 | 	github.com/mattn/go-tty v0.0.0-20190424173100-523744f04859
20 | 	github.com/montanaflynn/stats v0.5.0 // indirect
21 | 	github.com/neurosnap/sentences v1.0.6 // indirect
22 | 	github.com/nlopes/slack v0.6.0
23 | 	github.com/olekukonko/tablewriter v0.0.1 // indirect
24 | 	github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4
25 | 	github.com/pkg/errors v0.9.1
26 | 	github.com/shogo82148/go-shuffle v0.0.0-20180218125048-27e6095f230d // indirect
27 | 	github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect
28 | 	github.com/stretchr/testify v1.3.0 // indirect
29 | 	github.com/syou6162/GoOse v0.0.0-20190108170554-09969ebeb09f
30 | 	github.com/urfave/cli v1.22.5
31 | 	golang.org/x/sys v0.0.0-20190804053845-51ab0e2deafa // indirect
32 | 	golang.org/x/text v0.3.2 // indirect
33 | 	gopkg.in/neurosnap/sentences.v1 v1.0.6 // indirect
34 | 	gopkg.in/vmarkovtsev/go-lcss.v1 v1.0.0-20181020221121-dfc501d07ea0
35 | )
36 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
  1 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
  2 | github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
  3 | github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
  4 | github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
  5 | github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
  6 | github.com/araddon/dateparse v0.0.0-20190622164848-0fb0a474d195 h1:c4mLfegoDw6OhSJXTd2jUEQgZUQuJWtocudb97Qn9EM=
  7 | github.com/araddon/dateparse v0.0.0-20190622164848-0fb0a474d195/go.mod h1:SLqhdZcd+dF3TEVL2RMoob5bBP5R1P1qkox+HtCBgGI=
  8 | github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY=
  9 | github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
 10 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 11 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 12 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 13 | github.com/fatih/set v0.2.1 h1:nn2CaJyknWE/6txyUDGwysr3G5QC6xWB/PtVjPBbeaA=
 14 | github.com/fatih/set v0.2.1/go.mod h1:+RKtMCH+favT2+3YecHGxcc0b4KyVWA1QWWJUs4E0CI=
 15 | github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 h1:u8AQ9bPa9oC+8/A/jlWouakhIvkFfuxgIIRjiy8av7I=
 16 | github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573/go.mod h1:eBvb3i++NHDH4Ugo9qCvMw8t0mTSctaEa5blJbWcNxs=
 17 | github.com/go-sql-driver/mysql v1.5.0 h1:ozyZYNQW3x3HtqT1jira07DN2PArx2v7/mN66gGcHOs=
 18 | github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg=
 19 | github.com/gorilla/websocket v1.2.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ=
 20 | github.com/gorilla/websocket v1.4.0 h1:WDFjx/TMzVgy9VdMMQi2K2Emtwi2QcUQsztZ/zLaH/Q=
 21 | github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ=
 22 | github.com/ikawaha/kagome v1.11.2 h1:eCWpLqv5Euqa5JcwkaobUSy6uGM8rwwMw5Su3eRepBI=
 23 | github.com/ikawaha/kagome v1.11.2/go.mod h1:lHwhkGuuWqKWTxeQMppD0EmQAfKbc39QKx9qoWqgo+A=
 24 | github.com/jaytaylor/html2text v0.0.0-20190408195923-01ec452cbe43 h1:jTkyeF7NZ5oIr0ESmcrpiDgAfoidCBF4F5kJhjtaRwE=
 25 | github.com/jaytaylor/html2text v0.0.0-20190408195923-01ec452cbe43/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk=
 26 | github.com/jdkato/prose v0.0.0-20181022194337-a179b97cfa6f h1:AQ+AwWeEFf6NsjaMzhuVKLfxZH1+i7aoHuYXObQAzDo=
 27 | github.com/jdkato/prose v0.0.0-20181022194337-a179b97cfa6f/go.mod h1:jkF0lkxaX5PFSlk9l4Gh9Y+T57TqUZziWT7uZbW5ADg=
 28 | github.com/jmoiron/sqlx v1.3.1 h1:aLN7YINNZ7cYOPK3QC83dbM6KT0NMqVMw961TqrejlE=
 29 | github.com/jmoiron/sqlx v1.3.1/go.mod h1:2BljVx/86SuTyjE+aPYlHCTNvZrnJXghYGpNiXLBMCQ=
 30 | github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
 31 | github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
 32 | github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
 33 | github.com/lib/pq v1.10.0 h1:Zx5DJFEYQXio93kgXnQ09fXNiUKsqv4OUEu2UtGcB1E=
 34 | github.com/lib/pq v1.10.0/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
 35 | github.com/mackerelio/mackerel-client-go v0.16.0 h1:9AoqOg+kX07QsVBGN8yD3Zx0skd+cGqESp7kXquDjDs=
 36 | github.com/mackerelio/mackerel-client-go v0.16.0/go.mod h1:/GNOj+y1eFsd3CK8c6IQ/uS38/GT0+NWImk5YGJs5Lk=
 37 | github.com/mattn/go-isatty v0.0.8 h1:HLtExJ+uU2HOZ+wI0Tt5DtUDrx8yhUqDcp7fYERX4CE=
 38 | github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=
 39 | github.com/mattn/go-runewidth v0.0.4 h1:2BvfKmzob6Bmd4YsL0zygOqfdFnK7GR4QL06Do4/p7Y=
 40 | github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
 41 | github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg=
 42 | github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
 43 | github.com/mattn/go-tty v0.0.0-20190424173100-523744f04859 h1:smQbSzmT3EHl4EUwtFwFGmGIpiYgIiiPeVv1uguIQEE=
 44 | github.com/mattn/go-tty v0.0.0-20190424173100-523744f04859/go.mod h1:XPvLUNfbS4fJH25nqRHfWLMa1ONC8Amw+mIA639KxkE=
 45 | github.com/montanaflynn/stats v0.5.0 h1:2EkzeTSqBB4V4bJwWrt5gIIrZmpJBcoIRGS2kWLgzmk=
 46 | github.com/montanaflynn/stats v0.5.0/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc=
 47 | github.com/neurosnap/sentences v1.0.6 h1:iBVUivNtlwGkYsJblWV8GGVFmXzZzak907Ci8aA0VTE=
 48 | github.com/neurosnap/sentences v1.0.6/go.mod h1:pg1IapvYpWCJJm/Etxeh0+gtMf1rI1STY9S7eUCPbDc=
 49 | github.com/nlopes/slack v0.6.0 h1:jt0jxVQGhssx1Ib7naAOZEZcGdtIhTzkP0nopK0AsRA=
 50 | github.com/nlopes/slack v0.6.0/go.mod h1:JzQ9m3PMAqcpeCam7UaHSuBuupz7CmpjehYMayT6YOk=
 51 | github.com/olekukonko/tablewriter v0.0.1 h1:b3iUnf1v+ppJiOfNX4yxxqfWKMQPZR5yoh8urCTFX88=
 52 | github.com/olekukonko/tablewriter v0.0.1/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo=
 53 | github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4 h1:49lOXmGaUpV9Fz3gd7TFZY106KVlPVa5jcYD1gaQf98=
 54 | github.com/pkg/browser v0.0.0-20180916011732-0a3d74bf9ce4/go.mod h1:4OwLy04Bl9Ef3GJJCoec+30X3LQs/0/m4HFRt/2LUSA=
 55 | github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 56 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
 57 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 58 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 59 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 60 | github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=
 61 | github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
 62 | github.com/shogo82148/go-shuffle v0.0.0-20180218125048-27e6095f230d h1:rUbV6LJa5RXK3jT/4jnJUz3UkrXzW6cqB+n9Fkbv9jY=
 63 | github.com/shogo82148/go-shuffle v0.0.0-20180218125048-27e6095f230d/go.mod h1:2htx6lmL0NGLHlO8ZCf+lQBGBHIbEujyywxJArf+2Yc=
 64 | github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
 65 | github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
 66 | github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo=
 67 | github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM=
 68 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 69 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 70 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
 71 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 72 | github.com/syou6162/GoOse v0.0.0-20190108170554-09969ebeb09f h1:KZTZZaZYr4F+0V3AUEs2ZvOGYFlUKFdAWt+CkyhC2Wc=
 73 | github.com/syou6162/GoOse v0.0.0-20190108170554-09969ebeb09f/go.mod h1:T2hVrnNfCW4aQcCS7ReyHEKMEZat4F+fxMCzBlf1Q8g=
 74 | github.com/urfave/cli v1.22.5 h1:lNq9sAHXK2qfdI8W+GRItjCEkI+2oR4d+MEHy1CKXoU=
 75 | github.com/urfave/cli v1.22.5/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
 76 | github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 77 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 78 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 79 | golang.org/x/lint v0.0.0-20200302205851-738671d3881b h1:Wh+f8QHJXR411sJR8/vRBTZ7YapZaRvUcLFFJhusH0k=
 80 | golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY=
 81 | golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
 82 | golang.org/x/mod v0.2.0 h1:KU7oHjnv3XNWfa5COkzUifxZmxp1TyI7ImMXqFxLwvQ=
 83 | golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
 84 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 85 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 86 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 87 | golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 88 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b h1:0mm1VjtFUOIlE1SbDlwjYaDxZVDP2S5ou6y0gSgXHu8=
 89 | golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 90 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 91 | golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 92 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 93 | golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 94 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 95 | golang.org/x/sys v0.0.0-20190804053845-51ab0e2deafa h1:KIDDMLT1O0Nr7TSxp8xM5tJcdn8tgyAONntO829og1M=
 96 | golang.org/x/sys v0.0.0-20190804053845-51ab0e2deafa/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 97 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 98 | golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
 99 | golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
100 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
101 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
102 | golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
103 | golang.org/x/tools v0.0.0-20200612022331-742c5eb664c2 h1:DVqHa33CzfnTKwUV6be+I4hp31W6iXn3ZiEcdKGzLyI=
104 | golang.org/x/tools v0.0.0-20200612022331-742c5eb664c2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
105 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
106 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
107 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
108 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
109 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
110 | gopkg.in/neurosnap/sentences.v1 v1.0.6 h1:v7ElyP020iEZQONyLld3fHILHWOPs+ntzuQTNPkul8E=
111 | gopkg.in/neurosnap/sentences.v1 v1.0.6/go.mod h1:YlK+SN+fLQZj+kY3r8DkGDhDr91+S3JmTb5LSxFRQo0=
112 | gopkg.in/vmarkovtsev/go-lcss.v1 v1.0.0-20181020221121-dfc501d07ea0 h1:YY+ZVPsg2oJnV1rpzwIWtuCtQk71YFwuk47mMtjraN4=
113 | gopkg.in/vmarkovtsev/go-lcss.v1 v1.0.0-20181020221121-dfc501d07ea0/go.mod h1:6LhSPGi1OSJsWUQZridpjQXWEnDzw7EZAXSjc5SyF8A=
114 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
115 | 


--------------------------------------------------------------------------------
/lib/add/add.go:
--------------------------------------------------------------------------------
  1 | package add
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"log"
  6 | 	"time"
  7 | 
  8 | 	"os"
  9 | 
 10 | 	mkr "github.com/mackerelio/mackerel-client-go"
 11 | 	"github.com/syou6162/go-active-learning/lib/classifier"
 12 | 	"github.com/syou6162/go-active-learning/lib/hatena_bookmark"
 13 | 	"github.com/syou6162/go-active-learning/lib/service"
 14 | 	"github.com/syou6162/go-active-learning/lib/util"
 15 | 	"github.com/syou6162/go-active-learning/lib/util/file"
 16 | 	"github.com/urfave/cli"
 17 | )
 18 | 
 19 | // doAdd implements the "add" command: it reads examples from --input-filename,
 20 | // attaches metadata, fetches them, and then updates or creates each remaining
 21 | // example (scored first when a model could be loaded) before posting counts.
 22 | // Failures on a single example are logged and that example is skipped.
 23 | func doAdd(c *cli.Context) error {
 24 | 	inputFilename := c.String("input-filename")
 25 | 	if inputFilename == "" {
 26 | 		_ = cli.ShowCommandHelp(c, "add")
 27 | 		return cli.NewExitError("`input-filename` is a required field.", 1)
 28 | 	}
 29 | 
 30 | 	app, err := service.NewDefaultApp()
 31 | 	if err != nil {
 32 | 		return err
 33 | 	}
 34 | 	defer app.Close()
 35 | 
 36 | 	examples, err := file.ReadExamples(inputFilename)
 37 | 	if err != nil {
 38 | 		return err
 39 | 	}
 40 | 	if err := app.AttachMetadata(examples, 0, 0); err != nil {
 41 | 		return err
 42 | 	}
 43 | 
 44 | 	examples = util.FilterStatusCodeNotOkExamples(examples)
 45 | 	app.Fetch(examples)
 46 | 	examples = util.FilterStatusCodeOkExamples(examples)
 47 | 
 48 | 	// A model that cannot be loaded is not fatal; scoring is skipped instead.
 49 | 	m, err := app.FindLatestMIRAModel(classifier.EXAMPLE)
 50 | 	skipPredictScore := err != nil
 51 | 	if skipPredictScore {
 52 | 		log.Println(fmt.Sprintf("Error to load model %s", err.Error()))
 53 | 	}
 54 | 
 55 | 	for _, e := range examples {
 56 | 		if !skipPredictScore {
 57 | 			e.Score = m.PredictScore(e.Fv)
 58 | 		}
 59 | 		if e.CreatedAt.Before(time.Date(2000, 01, 01, 0, 0, 0, 0, time.Local)) {
 60 | 			log.Println(fmt.Sprintf("Skipping too old example: %s", e.Url))
 61 | 			continue
 62 | 		}
 63 | 		if err = app.UpdateOrCreateExample(e); err != nil {
 64 | 			log.Println(fmt.Sprintf("Error occurred processing %s %s", e.Url, err.Error()))
 65 | 			continue
 66 | 		}
 67 | 		if err = app.UpdateFeatureVector(e); err != nil {
 68 | 			log.Println(fmt.Sprintf("Error occurred processing %s feature vector %s", e.Url, err.Error()))
 69 | 			continue
 70 | 		}
 71 | 		// The bookmark is best effort: a failed lookup leaves the example as is.
 72 | 		if bookmark, err := hatena_bookmark.GetHatenaBookmark(e.FinalUrl); err == nil {
 73 | 			e.HatenaBookmark = bookmark
 74 | 			app.UpdateHatenaBookmark(e)
 75 | 		}
 76 | 	}
 77 | 
 78 | 	// Finish by reporting the per-label example counts.
 79 | 	return postNumOfExamplesToMackerel(app)
 80 | }
 81 | 
 82 | // postNumOfExamplesToMackerel counts the positive, negative and unlabeled
 83 | // examples through app and posts each count as a metric ("count.positive",
 84 | // "count.negative", "count.unlabeled"), in that order.
 85 | // The first error from counting or posting is returned immediately.
 86 | func postNumOfExamplesToMackerel(app service.GoActiveLearningApp) error {
 87 | 	metrics := []struct {
 88 | 		name  string
 89 | 		count func() (int, error)
 90 | 	}{
 91 | 		{"count.positive", app.CountPositiveExamples},
 92 | 		{"count.negative", app.CountNegativeExamples},
 93 | 		{"count.unlabeled", app.CountUnlabeledExamples},
 94 | 	}
 95 | 
 96 | 	for _, metric := range metrics {
 97 | 		n, err := metric.count()
 98 | 		if err != nil {
 99 | 			return err
100 | 		}
101 | 		if err := postNumOfExamplesByLabelToMackerel(metric.name, n); err != nil {
102 | 			return err
103 | 		}
104 | 	}
105 | 
106 | 	return nil
107 | }
108 | 
109 | // postNumOfExamplesByLabelToMackerel posts cnt as the service metric label,
110 | // timestamped with the current time. It returns nil without posting unless both
111 | // MACKEREL_APIKEY and MACKEREL_SERVICE_NAME are set.
112 | func postNumOfExamplesByLabelToMackerel(label string, cnt int) error {
113 | 	key := os.Getenv("MACKEREL_APIKEY")
114 | 	svc := os.Getenv("MACKEREL_SERVICE_NAME")
115 | 	if key == "" || svc == "" {
116 | 		return nil
117 | 	}
118 | 
119 | 	client := mkr.NewClient(key)
120 | 	point := &mkr.MetricValue{
121 | 		Name:  label,
122 | 		Time:  time.Now().Unix(),
123 | 		Value: cnt,
124 | 	}
125 | 	return client.PostServiceMetricValues(svc, []*mkr.MetricValue{point})
126 | }
127 | 
128 | var CommandAdd = cli.Command{ // CommandAdd is the "add" command; it runs doAdd.
129 | 	Name:  "add",
130 | 	Usage: "add urls",
131 | 	Description: `
132 | Add urls.
133 | `,
134 | 	Action: doAdd,
135 | 	Flags: []cli.Flag{
136 | 		cli.StringFlag{Name: "input-filename"}, // doAdd returns an exit error when this is empty
137 | 	},
138 | }
139 | 


--------------------------------------------------------------------------------
/lib/add/add_test.go:
--------------------------------------------------------------------------------
 1 | package add_test
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/syou6162/go-active-learning/lib/command"
 7 | 	"github.com/urfave/cli"
 8 | )
 9 | 
10 | // TestDoAdd runs the "add" command on the example input and expects no error.
11 | func TestDoAdd(t *testing.T) {
12 | 	cliApp := cli.NewApp()
13 | 	cliApp.Commands = command.Commands
14 | 	err := cliApp.Run([]string{
15 | 		"go-active-learning-web",
16 | 		"add",
17 | 		"--input-filename=../../tech_input_example.txt",
18 | 	})
19 | 	if err != nil {
20 | 		t.Error(err)
21 | 	}
22 | }
23 | 


--------------------------------------------------------------------------------
/lib/annotation/annotation.go:
--------------------------------------------------------------------------------
 1 | package annotation
 2 | 
 3 | import (
 4 | 	"github.com/syou6162/go-active-learning/lib/classifier"
 5 | 	"github.com/syou6162/go-active-learning/lib/model"
 6 | 	"github.com/urfave/cli"
 7 | )
 8 | 
 9 | type ActionType int // ActionType is the action that rune2ActionType derives from a typed rune.
10 | 
11 | const (
12 | 	LABEL_AS_POSITIVE ActionType = iota // key 'p'
13 | 	LABEL_AS_NEGATIVE                   // key 'n'
14 | 	HELP                                // key 'h', and any unmapped rune
15 | 	SKIP                                // key 's'
16 | 	EXIT                                // key 'e'
17 | )
18 | 
19 | // rune2ActionType maps a typed rune to its ActionType. Runes other than
20 | // 'p', 'n', 's', 'h' and 'e' yield HELP.
21 | func rune2ActionType(r rune) ActionType {
22 | 	actions := map[rune]ActionType{
23 | 		'p': LABEL_AS_POSITIVE,
24 | 		'n': LABEL_AS_NEGATIVE,
25 | 		's': SKIP,
26 | 		'h': HELP,
27 | 		'e': EXIT,
28 | 	}
29 | 	if action, ok := actions[r]; ok {
30 | 		return action
31 | 	}
32 | 	// Anything unrecognized is treated like 'h'.
33 | 	return HELP
34 | }
35 | 
36 | // NextExampleToBeAnnotated picks the example to annotate next.
37 | // It returns the first element of m.SortByScore(examples), or nil when that
38 | // result is empty.
39 | func NextExampleToBeAnnotated(m classifier.MIRAClassifier, examples model.Examples) *model.Example {
40 | 	ranked := m.SortByScore(examples)
41 | 	if len(ranked) > 0 {
42 | 		// A nil first element needs no special case: it is returned as nil.
43 | 		return ranked[0]
44 | 	}
45 | 	return nil
46 | }
47 | 
48 | var ActionHelpDoc = `
49 | p: Label this example as positive.
50 | n: Label this example as negative.
51 | s: Skip this example.
52 | h: Show this help.
53 | e: Exit.
54 | ` // ActionHelpDoc is the help text listing the keys handled by rune2ActionType.
55 | 
// CommandAnnotate defines the "annotate" command and its two subcommands:
// "cli" (handled by doAnnotate) and "slack" (handled by doAnnotateWithSlack).
var CommandAnnotate = cli.Command{
	Name:  "annotate",
	Usage: "Annotate URLs",
	Description: `
Annotate URLs using active learning.
`,
	Subcommands: []cli.Command{
		{
			// Interactive annotation on the terminal.
			Name:  "cli",
			Usage: "Annotate URLs using cli",
			Description: `
Annotate URLs using active learning using cli.
`,
			Action: doAnnotate,
			Flags: []cli.Flag{
				cli.BoolFlag{Name: "open-url", Usage: "Open url in background"},
				cli.BoolFlag{Name: "filter-status-code-ok", Usage: "Use only examples with status code = 200"},
				cli.BoolFlag{Name: "show-active-features"},
			},
		},
		{
			// Annotation driven by messages in a Slack channel.
			// doAnnotateWithSlack rejects an empty "channel" value.
			Name:  "slack",
			Usage: "Annotate URLs using slack",
			Description: `
Annotate URLs using active learning using slack.
`,
			Action: doAnnotateWithSlack,
			Flags: []cli.Flag{
				cli.StringFlag{Name: "channel"},
				cli.BoolFlag{Name: "filter-status-code-ok", Usage: "Use only examples with status code = 200"},
			},
		},
	},
}
90 | 


--------------------------------------------------------------------------------
/lib/annotation/annotation_cli.go:
--------------------------------------------------------------------------------
  1 | package annotation
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"os"
  6 | 
  7 | 	"math"
  8 | 	"sort"
  9 | 
 10 | 	"github.com/mattn/go-tty"
 11 | 	"github.com/pkg/browser"
 12 | 	"github.com/syou6162/go-active-learning/lib/classifier"
 13 | 	"github.com/syou6162/go-active-learning/lib/example"
 14 | 	"github.com/syou6162/go-active-learning/lib/model"
 15 | 	"github.com/syou6162/go-active-learning/lib/service"
 16 | 	"github.com/syou6162/go-active-learning/lib/util"
 17 | 	"github.com/syou6162/go-active-learning/lib/util/converter"
 18 | 	"github.com/urfave/cli"
 19 | )
 20 | 
 21 | func input2ActionType() (ActionType, error) {
 22 | 	t, err := tty.Open()
 23 | 	defer t.Close()
 24 | 	if err != nil {
 25 | 		return EXIT, err
 26 | 	}
 27 | 	var r rune
 28 | 	for r == 0 {
 29 | 		r, err = t.ReadRune()
 30 | 		if err != nil {
 31 | 			return HELP, err
 32 | 		}
 33 | 	}
 34 | 	return rune2ActionType(r), nil
 35 | }
 36 | 
 37 | func doAnnotate(c *cli.Context) error {
 38 | 	openUrl := c.Bool("open-url")
 39 | 	filterStatusCodeOk := c.Bool("filter-status-code-ok")
 40 | 	showActiveFeatures := c.Bool("show-active-features")
 41 | 
 42 | 	app, err := service.NewDefaultApp()
 43 | 	if err != nil {
 44 | 		return err
 45 | 	}
 46 | 	defer app.Close()
 47 | 
 48 | 	examples, err := app.SearchExamples()
 49 | 	if err != nil {
 50 | 		return err
 51 | 	}
 52 | 
 53 | 	stat := example.GetStat(examples)
 54 | 	fmt.Fprintln(os.Stderr, fmt.Sprintf("Positive:%d, Negative:%d, Unlabeled:%d", stat["positive"], stat["negative"], stat["unlabeled"]))
 55 | 
 56 | 	app.Fetch(examples)
 57 | 	for _, e := range examples {
 58 | 		app.UpdateFeatureVector(e)
 59 | 	}
 60 | 	if filterStatusCodeOk {
 61 | 		examples = util.FilterStatusCodeOkExamples(examples)
 62 | 	}
 63 | 
 64 | 	m, err := classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples))
 65 | 	if err != nil {
 66 | 		return err
 67 | 	}
 68 | 
 69 | annotationLoop:
 70 | 	for {
 71 | 		e := NextExampleToBeAnnotated(*m, examples)
 72 | 		if e == nil {
 73 | 			fmt.Println("No example")
 74 | 			break annotationLoop
 75 | 		}
 76 | 		fmt.Println("Label this example (Score: " + fmt.Sprintf("%+0.03f", e.Score) + "): " + e.Url + " (" + e.Title + ")")
 77 | 
 78 | 		if openUrl {
 79 | 			browser.OpenURL(e.Url)
 80 | 		}
 81 | 		if showActiveFeatures {
 82 | 			ShowActiveFeatures(*m, *e, 5)
 83 | 		}
 84 | 
 85 | 		act, err := input2ActionType()
 86 | 		if err != nil {
 87 | 			return err
 88 | 		}
 89 | 		switch act {
 90 | 		case LABEL_AS_POSITIVE:
 91 | 			fmt.Println("Labeled as positive")
 92 | 			e.Annotate(model.POSITIVE)
 93 | 			app.UpdateOrCreateExample(e)
 94 | 		case LABEL_AS_NEGATIVE:
 95 | 			fmt.Println("Labeled as negative")
 96 | 			e.Annotate(model.NEGATIVE)
 97 | 			app.UpdateOrCreateExample(e)
 98 | 		case SKIP:
 99 | 			fmt.Println("Skiped this example")
100 | 			examples = util.RemoveExample(examples, *e)
101 | 			continue
102 | 		case HELP:
103 | 			fmt.Println(ActionHelpDoc)
104 | 		case EXIT:
105 | 			fmt.Println("EXIT")
106 | 			break annotationLoop
107 | 		default:
108 | 			break annotationLoop
109 | 		}
110 | 
111 | 		m, err = classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples))
112 | 		if err != nil {
113 | 			return err
114 | 		}
115 | 	}
116 | 
117 | 	return nil
118 | }
119 | 
// FeatureWeightPair holds a feature together with the weight the classifier
// reports for it (see SortedActiveFeatures).
type FeatureWeightPair struct {
	Feature string
	Weight  float64
}

// FeatureWeightPairs is a list of FeatureWeightPair. Its sort.Interface
// methods order elements by the absolute value of Weight.
type FeatureWeightPairs []FeatureWeightPair
126 | 
127 | func SortedActiveFeatures(model classifier.MIRAClassifier, example model.Example, n int) FeatureWeightPairs {
128 | 	pairs := FeatureWeightPairs{}
129 | 	for _, f := range example.Fv {
130 | 		pairs = append(pairs, FeatureWeightPair{f, model.GetWeight(f)})
131 | 	}
132 | 	sort.Sort(sort.Reverse(pairs))
133 | 
134 | 	result := FeatureWeightPairs{}
135 | 	cnt := 0
136 | 	for _, pair := range pairs {
137 | 		if cnt >= n {
138 | 			break
139 | 		}
140 | 		if (example.Score > 0.0 && pair.Weight > 0.0) || (example.Score < 0.0 && pair.Weight < 0.0) {
141 | 			result = append(result, pair)
142 | 			cnt++
143 | 		}
144 | 	}
145 | 	return result
146 | }
147 | 
148 | func ShowActiveFeatures(model classifier.MIRAClassifier, example model.Example, n int) {
149 | 	for _, pair := range SortedActiveFeatures(model, example, n) {
150 | 		fmt.Println(fmt.Sprintf("%+0.1f %s", pair.Weight, pair.Feature))
151 | 	}
152 | }
153 | 
154 | func (slice FeatureWeightPairs) Len() int {
155 | 	return len(slice)
156 | }
157 | 
158 | func (slice FeatureWeightPairs) Less(i, j int) bool {
159 | 	return math.Abs(slice[i].Weight) < math.Abs(slice[j].Weight)
160 | }
161 | 
162 | func (slice FeatureWeightPairs) Swap(i, j int) {
163 | 	slice[i], slice[j] = slice[j], slice[i]
164 | }
165 | 


--------------------------------------------------------------------------------
/lib/annotation/annotation_slack.go:
--------------------------------------------------------------------------------
  1 | package annotation
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"os"
  6 | 
  7 | 	"github.com/nlopes/slack"
  8 | 	"github.com/pkg/errors"
  9 | 	"github.com/syou6162/go-active-learning/lib/classifier"
 10 | 	"github.com/syou6162/go-active-learning/lib/example"
 11 | 	"github.com/syou6162/go-active-learning/lib/model"
 12 | 	"github.com/syou6162/go-active-learning/lib/service"
 13 | 	"github.com/syou6162/go-active-learning/lib/util"
 14 | 	"github.com/syou6162/go-active-learning/lib/util/converter"
 15 | 	"github.com/urfave/cli"
 16 | )
 17 | 
// doAnnotateWithSlack implements "annotate slack": it shows examples in a
// Slack channel over an RTM connection and interprets single-character
// messages posted in that channel as annotation commands.
//
// Flags read from c:
//   - channel: ID of the channel to talk in; required, an empty value prints
//     the command help and returns an exit error.
//   - filter-status-code-ok: keep only examples accepted by
//     util.FilterStatusCodeOkExamples.
//
// The Slack token is taken from the SLACK_TOKEN environment variable.
//
// NOTE(review): unlike doAnnotate, labels assigned here are never passed to
// app.UpdateOrCreateExample, so they appear to live only in memory for the
// duration of this session — confirm whether that is intended.
func doAnnotateWithSlack(c *cli.Context) error {
	channelID := c.String("channel")
	filterStatusCodeOk := c.Bool("filter-status-code-ok")

	if channelID == "" {
		_ = cli.ShowCommandHelp(c, "slack")
		return cli.NewExitError("`channel` is a required field.", 1)
	}

	api := slack.New(os.Getenv("SLACK_TOKEN"))
	rtm := api.NewRTM()
	// The connection is managed on its own goroutine; events are consumed
	// from rtm.IncomingEvents in the loop below.
	go rtm.ManageConnection()

	app, err := service.NewDefaultApp()
	if err != nil {
		return err
	}
	defer app.Close()

	examples, err := app.SearchExamples()
	if err != nil {
		return err
	}

	// Post the label statistics to the channel before starting.
	stat := example.GetStat(examples)
	msg := rtm.NewOutgoingMessage(fmt.Sprintf("Positive:%d, Negative:%d, Unlabeled:%d", stat["positive"], stat["negative"], stat["unlabeled"]), channelID)
	rtm.SendMessage(msg)

	app.Fetch(examples)
	for _, e := range examples {
		app.UpdateFeatureVector(e)
	}
	if filterStatusCodeOk {
		examples = util.FilterStatusCodeOkExamples(examples)
	}

	m, err := classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples))
	if err != nil {
		return err
	}
	e := NextExampleToBeAnnotated(*m, examples)
	if e == nil {
		return errors.New("No e to annotate")
	}

	rtm.SendMessage(rtm.NewOutgoingMessage("Ready to annotate!", channelID))
	showExample(rtm, *m, e, channelID)
	// Timestamp carried by the most recent AckMessage; used as the target of
	// the reactions added below. Presumably this identifies the message last
	// sent by this process — verify against the slack library.
	prevTimestamp := ""

annotationLoop:
	for {
		select {
		case msg := <-rtm.IncomingEvents:
			switch ev := msg.Data.(type) {
			case *slack.AckMessage:
				prevTimestamp = ev.Timestamp
			case *slack.MessageEvent:
				// Ignore messages from other channels. This break leaves the
				// type switch only; the loop keeps running.
				if ev.Channel != channelID {
					break
				}
				text := ev.Text
				// Only texts exactly one byte long are treated as commands.
				if len(text) > 1 || len(text) == 0 {
					break
				}
				r := []rune(text)[0]
				act := rune2ActionType(r)

				switch act {
				case LABEL_AS_POSITIVE:
					e.Annotate(model.POSITIVE)
					// Retrain with the new label before choosing the next example.
					m, err = classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples))
					if err != nil {
						return err
					}
					rtm.AddReaction("heavy_plus_sign", slack.NewRefToMessage(channelID, prevTimestamp))
				case LABEL_AS_NEGATIVE:
					e.Annotate(model.NEGATIVE)
					m, err = classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(examples))
					if err != nil {
						return err
					}
					rtm.AddReaction("heavy_minus_sign", slack.NewRefToMessage(channelID, prevTimestamp))
				case SKIP:
					rtm.SendMessage(rtm.NewOutgoingMessage("Skiped this e", channelID))
					// Drop the example from the working set; the model is not
					// retrained. The break only leaves this inner switch, so
					// the next example is still selected and shown below.
					examples = util.RemoveExample(examples, *e)
					break
				case HELP:
					rtm.SendMessage(rtm.NewOutgoingMessage(ActionHelpDoc, channelID))
				case EXIT:
					rtm.SendMessage(rtm.NewOutgoingMessage("EXIT", channelID))
					break annotationLoop
				default:
					break annotationLoop
				}
				// Reached after every command except EXIT: pick and show the
				// next example using the current model.
				e = NextExampleToBeAnnotated(*m, examples)
				if e == nil {
					return errors.New("No e to annotate")
				}
				showExample(rtm, *m, e, channelID)
			case *slack.InvalidAuthEvent:
				return errors.New("Invalid credentials")
			default:
				// All other event types are ignored.
			}
		}
	}
	return nil
}
125 | 
126 | func showExample(rtm *slack.RTM, model classifier.MIRAClassifier, example *model.Example, channelID string) {
127 | 	activeFeaturesStr := "Active Features: "
128 | 	for _, pair := range SortedActiveFeatures(model, *example, 5) {
129 | 		activeFeaturesStr += fmt.Sprintf("%s(%+0.1f) ", pair.Feature, pair.Weight)
130 | 	}
131 | 	rtm.SendMessage(rtm.NewOutgoingMessage(fmt.Sprintf("%s\nScore: %+0.2f\n%s", example.Url, example.Score, activeFeaturesStr), channelID))
132 | }
133 | 


--------------------------------------------------------------------------------
/lib/classifier/mira.go:
--------------------------------------------------------------------------------
  1 | package classifier
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"math"
  6 | 	"math/rand"
  7 | 	"os"
  8 | 	"runtime"
  9 | 	"sort"
 10 | 	"sync"
 11 | 
 12 | 	"github.com/pkg/errors"
 13 | 	"github.com/syou6162/go-active-learning/lib/evaluation"
 14 | 	"github.com/syou6162/go-active-learning/lib/feature"
 15 | 	"github.com/syou6162/go-active-learning/lib/model"
 16 | 	"github.com/syou6162/go-active-learning/lib/util"
 17 | )
 18 | 
// ModelType tags a MIRAClassifier with the kind of model it is. The value is
// stored in MIRAClassifier.ModelType; nothing in this file branches on it.
type ModelType int

const (
	// EXAMPLE is the model type used by the annotate and diagnose commands.
	EXAMPLE ModelType = 0
	// TWITTER is presumably the model type for tweets — confirm against callers.
	TWITTER ModelType = 1
)
 25 | 
// MIRAClassifier is a linear classifier: a per-feature weight table plus the
// hyper-parameter it was trained with and the evaluation figures recorded by
// NewMIRAClassifierByCrossValidation.
type MIRAClassifier struct {
	ModelType ModelType          `json:"ModelType"`
	Weight    map[string]float64 `json:"Weight"`    // feature -> learned weight
	C         float64            `json:"C"`         // regularisation term used in learn
	Accuracy  float64            `json:"Accuracy"`  // measured on the development split
	Precision float64            `json:"Precision"` // measured on the development split
	Recall    float64            `json:"Recall"`    // measured on the development split
	Fvalue    float64            `json:"Fvalue"`    // 2*Recall*Precision / (Recall+Precision)
}
 35 | 
// LearningInstance is anything the classifier can train on or score: it
// exposes a feature vector and a label.
type LearningInstance interface {
	GetFeatureVector() feature.FeatureVector
	GetLabel() model.LabelType
}

// LearningInstances is a list of LearningInstance values.
type LearningInstances []LearningInstance
 42 | 
 43 | var errNoTrainingInstances = errors.New("Empty training set")
 44 | var errNoDevelopmentInstances = errors.New("Empty development set")
 45 | var errNoMIRAModelLearned = errors.New("Fail to learn MIRA models")
 46 | var errModelEvaluationFailure = errors.New("Failed to evaluate best MIRA")
 47 | var errTrainingInstancesAllPositive = errors.New("Labels of training instances are all positive")
 48 | var errTrainingInstancesAllNegative = errors.New("Labels of training instances are all negative")
 49 | var errDevelopmentInstancesAllPositive = errors.New("Labels of development instances are all positive")
 50 | var errDevelopmentInstancesAllNegative = errors.New("Labels of development instances are all negative")
 51 | 
 52 | func newMIRAClassifier(modelType ModelType, c float64) *MIRAClassifier {
 53 | 	return &MIRAClassifier{
 54 | 		ModelType: modelType,
 55 | 		Weight:    make(map[string]float64),
 56 | 		C:         c,
 57 | 		Accuracy:  0.0,
 58 | 		Precision: 0.0,
 59 | 		Recall:    0.0,
 60 | 		Fvalue:    0.0,
 61 | 	}
 62 | }
 63 | 
 64 | func filterLabeledInstances(instances LearningInstances) LearningInstances {
 65 | 	var result LearningInstances
 66 | 	for _, i := range instances {
 67 | 		if i.GetLabel() != 0 {
 68 | 			result = append(result, i)
 69 | 		}
 70 | 	}
 71 | 	return result
 72 | }
 73 | 
 74 | func shuffle(instances LearningInstances) {
 75 | 	n := len(instances)
 76 | 	for i := n - 1; i >= 0; i-- {
 77 | 		j := rand.Intn(i + 1)
 78 | 		instances[i], instances[j] = instances[j], instances[i]
 79 | 	}
 80 | }
 81 | 
 82 | func splitTrainAndDev(instances LearningInstances) (train LearningInstances, dev LearningInstances) {
 83 | 	shuffle(instances)
 84 | 	n := int(0.8 * float64(len(instances)))
 85 | 	return instances[0:n], instances[n:]
 86 | }
 87 | 
 88 | func NewMIRAClassifier(modelType ModelType, instances LearningInstances, c float64) *MIRAClassifier {
 89 | 	train := filterLabeledInstances(instances)
 90 | 	model := newMIRAClassifier(modelType, c)
 91 | 	for iter := 0; iter < 30; iter++ {
 92 | 		shuffle(train)
 93 | 		for _, example := range train {
 94 | 			model.learn(example)
 95 | 		}
 96 | 	}
 97 | 	return model
 98 | }
 99 | 
100 | func overSamplingPositiveExamples(instances LearningInstances) LearningInstances {
101 | 	overSampled := LearningInstances{}
102 | 	posInstances := LearningInstances{}
103 | 	negInstances := LearningInstances{}
104 | 
105 | 	numNeg := 0
106 | 
107 | 	for _, i := range instances {
108 | 		if i.GetLabel() == model.NEGATIVE {
109 | 			numNeg += 1
110 | 			negInstances = append(negInstances, i)
111 | 		} else if i.GetLabel() == model.POSITIVE {
112 | 			posInstances = append(posInstances, i)
113 | 		}
114 | 	}
115 | 
116 | 	for len(overSampled) <= numNeg {
117 | 		shuffle(posInstances)
118 | 		overSampled = append(overSampled, posInstances[0])
119 | 	}
120 | 	overSampled = append(overSampled, negInstances...)
121 | 	shuffle(overSampled)
122 | 
123 | 	return overSampled
124 | }
125 | 
126 | func extractGoldLabels(instances LearningInstances) []model.LabelType {
127 | 	golds := make([]model.LabelType, 0, 0)
128 | 	for _, i := range instances {
129 | 		golds = append(golds, i.GetLabel())
130 | 	}
131 | 	return golds
132 | }
133 | 
134 | type MIRAClassifierList []MIRAClassifier
135 | 
136 | func (l MIRAClassifierList) Len() int           { return len(l) }
137 | func (l MIRAClassifierList) Less(i, j int) bool { return l[i].Fvalue < l[j].Fvalue }
138 | func (l MIRAClassifierList) Swap(i, j int)      { l[i], l[j] = l[j], l[i] }
139 | 
140 | func allSameLabel(instances LearningInstances, label model.LabelType) bool {
141 | 	for _, instance := range instances {
142 | 		if instance.GetLabel() != label {
143 | 			return false
144 | 		}
145 | 	}
146 | 	return true
147 | }
148 | 
149 | func isValidTrainAndDevelopmentInstances(train LearningInstances, dev LearningInstances) (bool, error) {
150 | 	if len(train) == 0 {
151 | 		return false, errNoTrainingInstances
152 | 	}
153 | 	if len(dev) == 0 {
154 | 		return false, errNoDevelopmentInstances
155 | 	}
156 | 
157 | 	if allSameLabel(train, model.POSITIVE) {
158 | 		return false, errTrainingInstancesAllPositive
159 | 	}
160 | 	if allSameLabel(train, model.NEGATIVE) {
161 | 		return false, errTrainingInstancesAllNegative
162 | 	}
163 | 	if allSameLabel(dev, model.POSITIVE) {
164 | 		return false, errDevelopmentInstancesAllPositive
165 | 	}
166 | 	if allSameLabel(dev, model.NEGATIVE) {
167 | 		return false, errDevelopmentInstancesAllNegative
168 | 	}
169 | 
170 | 	return true, nil
171 | }
172 | 
173 | func NewMIRAClassifierByCrossValidation(modelType ModelType, instances LearningInstances) (*MIRAClassifier, error) {
174 | 	shuffle(instances)
175 | 	train, dev := splitTrainAndDev(filterLabeledInstances(instances))
176 | 	if valid, err := isValidTrainAndDevelopmentInstances(train, dev); !valid {
177 | 		return nil, err
178 | 	}
179 | 
180 | 	train = overSamplingPositiveExamples(train)
181 | 
182 | 	params := []float64{1000, 500, 100, 50, 10.0, 5.0, 1.0, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001}
183 | 	miraResults := MIRAClassifierList{}
184 | 
185 | 	wg := &sync.WaitGroup{}
186 | 	cpus := runtime.NumCPU()
187 | 	runtime.GOMAXPROCS(cpus)
188 | 
189 | 	models := make([]*MIRAClassifier, len(params))
190 | 	for idx, c := range params {
191 | 		wg.Add(1)
192 | 		go func(idx int, c float64) {
193 | 			defer wg.Done()
194 | 			model := NewMIRAClassifier(modelType, train, c)
195 | 			models[idx] = model
196 | 		}(idx, c)
197 | 	}
198 | 	wg.Wait()
199 | 
200 | 	if len(models) == 0 {
201 | 		return nil, errNoMIRAModelLearned
202 | 	}
203 | 
204 | 	maxFvalue := math.Inf(-1)
205 | 	for _, m := range models {
206 | 		devPredicts := make([]model.LabelType, len(dev))
207 | 		for i, instance := range dev {
208 | 			devPredicts[i] = m.Predict(instance.GetFeatureVector())
209 | 		}
210 | 		m.Accuracy = evaluation.GetAccuracy(extractGoldLabels(dev), devPredicts)
211 | 		m.Precision = evaluation.GetPrecision(extractGoldLabels(dev), devPredicts)
212 | 		m.Recall = evaluation.GetRecall(extractGoldLabels(dev), devPredicts)
213 | 		m.Fvalue = (2 * m.Recall * m.Precision) / (m.Recall + m.Precision)
214 | 		fmt.Fprintln(os.Stderr, fmt.Sprintf("C:%0.03f\tAccuracy:%0.03f\tPrecision:%0.03f\tRecall:%0.03f\tF-value:%0.03f", m.C, m.Accuracy, m.Precision, m.Recall, m.Fvalue))
215 | 		tp, fp, fn, tn := evaluation.GetConfusionMatrix(extractGoldLabels(dev), devPredicts)
216 | 		fmt.Fprintln(os.Stderr, fmt.Sprintf("tp:%d\tfp:%d\tfn:%d\ttn:%d", tp, fp, fn, tn))
217 | 		if math.IsNaN(m.Fvalue) {
218 | 			continue
219 | 		}
220 | 		miraResults = append(miraResults, *m)
221 | 		if m.Fvalue >= maxFvalue {
222 | 			maxFvalue = m.Fvalue
223 | 		}
224 | 	}
225 | 	if len(miraResults) == 0 {
226 | 		return nil, errModelEvaluationFailure
227 | 	}
228 | 
229 | 	sort.Sort(sort.Reverse(miraResults))
230 | 	bestModel := &miraResults[0]
231 | 	instances = overSamplingPositiveExamples(instances)
232 | 	shuffle(instances)
233 | 	result := NewMIRAClassifier(modelType, filterLabeledInstances(instances), bestModel.C)
234 | 	result.Accuracy = bestModel.Accuracy
235 | 	result.Precision = bestModel.Precision
236 | 	result.Recall = bestModel.Recall
237 | 	result.Fvalue = bestModel.Fvalue
238 | 	return result, nil
239 | }
240 | 
241 | func (m *MIRAClassifier) learn(instance LearningInstance) {
242 | 	tmp := float64(instance.GetLabel()) * m.PredictScore(instance.GetFeatureVector()) // y w^T x
243 | 	loss := 0.0
244 | 	if tmp < 1.0 {
245 | 		loss = 1 - tmp
246 | 	}
247 | 
248 | 	norm := float64(len(instance.GetFeatureVector()) * len(instance.GetFeatureVector()))
249 | 	// tau := math.Min(m.C, loss/norm) // update by PA-I
250 | 	tau := loss / (norm + 1.0/m.C) // update by PA-II
251 | 
252 | 	if tau != 0.0 {
253 | 		for _, f := range instance.GetFeatureVector() {
254 | 			w, _ := m.Weight[f]
255 | 			m.Weight[f] = w + tau*float64(instance.GetLabel())
256 | 		}
257 | 	}
258 | }
259 | 
260 | func (m MIRAClassifier) PredictScore(features feature.FeatureVector) float64 {
261 | 	result := 0.0
262 | 	for _, f := range features {
263 | 		w, ok := m.Weight[f]
264 | 		if ok {
265 | 			result = result + w*1.0
266 | 		}
267 | 	}
268 | 	return result
269 | }
270 | 
271 | func (m MIRAClassifier) Predict(features feature.FeatureVector) model.LabelType {
272 | 	if m.PredictScore(features) > 0 {
273 | 		return model.POSITIVE
274 | 	}
275 | 	return model.NEGATIVE
276 | }
277 | 
278 | func (m MIRAClassifier) SortByScore(examples model.Examples) model.Examples {
279 | 	var unlabeledExamples model.Examples
280 | 	for _, e := range util.FilterUnlabeledExamples(examples) {
281 | 		e.Score = m.PredictScore(e.Fv)
282 | 		if !e.IsLabeled() && e.Score != 0.0 {
283 | 			unlabeledExamples = append(unlabeledExamples, e)
284 | 		}
285 | 	}
286 | 
287 | 	sort.Sort(unlabeledExamples)
288 | 	return unlabeledExamples
289 | }
290 | 
291 | func (m MIRAClassifier) GetWeight(f string) float64 {
292 | 	w, ok := m.Weight[f]
293 | 	if ok {
294 | 		return w
295 | 	}
296 | 	return 0.0
297 | }
298 | 
299 | func (m MIRAClassifier) GetActiveFeatures() []string {
300 | 	result := make([]string, 0)
301 | 	for f := range m.Weight {
302 | 		result = append(result, f)
303 | 	}
304 | 	return result
305 | }
306 | 


--------------------------------------------------------------------------------
/lib/classifier/mira_test.go:
--------------------------------------------------------------------------------
 1 | package classifier
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/syou6162/go-active-learning/lib/example"
 7 | 	"github.com/syou6162/go-active-learning/lib/model"
 8 | )
 9 | 
10 | func TestPredictScore(t *testing.T) {
11 | 	e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
12 | 	e1.Title = "bookmark"
13 | 	e1.Fv = []string{"hoge", "fuga"}
14 | 	e2 := example.NewExample("http://google.com", model.NEGATIVE)
15 | 	e2.Title = "google"
16 | 	e2.Fv = []string{"piyo", "aaa"}
17 | 	e3 := example.NewExample("https://www.hatena.ne.jp", model.POSITIVE)
18 | 	e3.Title = "hatena"
19 | 	e3.Fv = []string{"hoge", "fuga"}
20 | 	e4 := example.NewExample("http://hogehoge.com", model.UNLABELED)
21 | 	e4.Title = "hogehoge"
22 | 	e4.Fv = []string{"piyo", "hoge"}
23 | 
24 | 	examples := LearningInstances{e1, e2, e3, e4}
25 | 	c := NewMIRAClassifier(EXAMPLE, examples, 1.0)
26 | 
27 | 	if c.PredictScore(e4.Fv) < 0.0 {
28 | 		t.Errorf("c.PredictScore(e4.Fv) == %f, want >= 0", c.PredictScore(e4.Fv))
29 | 	}
30 | }
31 | 
32 | func TestSplitTrainAndDev(t *testing.T) {
33 | 	e1 := example.NewExample("http://a.hatena.ne.jp", model.POSITIVE)
34 | 	e2 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
35 | 	e3 := example.NewExample("http://google.com", model.UNLABELED)
36 | 	e4 := example.NewExample("https://a.hatena.ne.jp", model.POSITIVE)
37 | 	e5 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
38 | 	e6 := example.NewExample("https://a.hatena.ne.jp", model.POSITIVE)
39 | 	e7 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
40 | 	e8 := example.NewExample("http://google.com", model.UNLABELED)
41 | 	e9 := example.NewExample("https://a.hatena.ne.jp", model.POSITIVE)
42 | 	e10 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
43 | 
44 | 	train, dev := splitTrainAndDev(LearningInstances{e1, e2, e3, e4, e5, e6, e7, e8, e9, e10})
45 | 	if len(train) != 8 {
46 | 		t.Error("Number of training examples should be 8")
47 | 	}
48 | 	if len(dev) != 2 {
49 | 		t.Error("Number of dev examples should be 2")
50 | 	}
51 | }
52 | 
53 | func TestGetWeight(t *testing.T) {
54 | 	e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
55 | 	e1.Title = "bookmark"
56 | 	e1.Fv = []string{"hoge", "fuga"}
57 | 	e2 := example.NewExample("http://google.com", model.NEGATIVE)
58 | 	e2.Title = "google"
59 | 	e2.Fv = []string{"piyo", "aaa"}
60 | 	e3 := example.NewExample("https://www.hatena.ne.jp", model.POSITIVE)
61 | 	e3.Title = "hatena"
62 | 	e3.Fv = []string{"hoge", "fuga"}
63 | 	e4 := example.NewExample("http://hogehoge.com", model.UNLABELED)
64 | 	e4.Title = "hogehoge"
65 | 	e4.Fv = []string{"piyo", "hoge"}
66 | 
67 | 	examples := LearningInstances{e1, e2, e3, e4}
68 | 	c := NewMIRAClassifier(EXAMPLE, examples, 1.0)
69 | 
70 | 	if c.GetWeight("hoge") <= 0.0 {
71 | 		t.Errorf("c.GetWeight('hoge') == %f, want > 0", c.GetWeight("hoge"))
72 | 	}
73 | }
74 | 
75 | func TestGetActiveFeatures(t *testing.T) {
76 | 	e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
77 | 	e1.Title = "bookmark"
78 | 	e1.Fv = []string{"hoge", "fuga"}
79 | 	e2 := example.NewExample("http://google.com", model.NEGATIVE)
80 | 	e2.Title = "google"
81 | 	e2.Fv = []string{"piyo", "aaa"}
82 | 	e3 := example.NewExample("https://www.hatena.ne.jp", model.POSITIVE)
83 | 	e3.Title = "hatena"
84 | 	e3.Fv = []string{"hoge", "fuga"}
85 | 	e4 := example.NewExample("http://hogehoge.com", model.UNLABELED)
86 | 	e4.Title = "hogehoge"
87 | 	e4.Fv = []string{"piyo", "hoge"}
88 | 
89 | 	examples := LearningInstances{e1, e2, e3, e4}
90 | 	c := NewMIRAClassifier(EXAMPLE, examples, 1.0)
91 | 
92 | 	if len(c.GetActiveFeatures()) <= 0 {
93 | 		t.Errorf("len(c.GetActiveFeatures()) <= %d, want > 0", len(c.GetActiveFeatures()))
94 | 	}
95 | }
96 | 


--------------------------------------------------------------------------------
/lib/command/command.go:
--------------------------------------------------------------------------------
 1 | package command
 2 | 
 3 | import (
 4 | 	"github.com/syou6162/go-active-learning/lib/add"
 5 | 	"github.com/syou6162/go-active-learning/lib/annotation"
 6 | 	"github.com/syou6162/go-active-learning/lib/diagnosis"
 7 | 	"github.com/syou6162/go-active-learning/lib/related_example"
 8 | 	"github.com/syou6162/go-active-learning/lib/top_accessed_example"
 9 | 	"github.com/urfave/cli"
10 | )
11 | 
// Commands is the list of top-level CLI commands. The tests in this
// repository install it on a cli.App via app.Commands.
var Commands = []cli.Command{
	add.CommandAdd,
	related_example.CommandAddRelatedExamples,
	annotation.CommandAnnotate,
	top_accessed_example.CommandAddTopAccessedExamples,
	diagnosis.CommandDiagnose,
}
19 | 


--------------------------------------------------------------------------------
/lib/diagnosis/diagnosis.go:
--------------------------------------------------------------------------------
 1 | package diagnosis
 2 | 
 3 | import (
 4 | 	featureweight "github.com/syou6162/go-active-learning/lib/diagnosis/feature_weight"
 5 | 	labelconflict "github.com/syou6162/go-active-learning/lib/diagnosis/label_conflict"
 6 | 	"github.com/urfave/cli"
 7 | )
 8 | 
// CommandDiagnose defines the "diagnose" command and its two subcommands:
// "label-conflict" (labelconflict.DoLabelConflict) and "feature-weight"
// (featureweight.DoListFeatureWeight). Both accept --filter-status-code-ok.
var CommandDiagnose = cli.Command{
	Name:  "diagnose",
	Usage: "Diagnose training data or learned model",
	Description: `
Diagnose training data or learned model. This mode has two subcommand: label-conflict and feature-weight.
`,

	Subcommands: []cli.Command{
		{
			// Lists training examples whose label disagrees with the model.
			Name:  "label-conflict",
			Usage: "Diagnose label conflicts in training data",
			Description: `
Diagnose label conflicts in training data. 'conflict' means that an annotated label is '-1/1', but a predicted label by model is '1/-1'.
`,
			Action: labelconflict.DoLabelConflict,
			Flags: []cli.Flag{
				cli.BoolFlag{Name: "filter-status-code-ok", Usage: "Use only examples with status code = 200"},
			},
		},
		{
			// Prints every learned feature weight.
			Name:  "feature-weight",
			Usage: "List feature weight",
			Description: `
List feature weight.
`,
			Action: featureweight.DoListFeatureWeight,
			Flags: []cli.Flag{
				cli.BoolFlag{Name: "filter-status-code-ok", Usage: "Use only examples with status code = 200"},
			},
		},
	},
}
41 | 


--------------------------------------------------------------------------------
/lib/diagnosis/feature_weight/feature_weight.go:
--------------------------------------------------------------------------------
 1 | package featureweight
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"sort"
 6 | 
 7 | 	"github.com/syou6162/go-active-learning/lib/classifier"
 8 | 	"github.com/syou6162/go-active-learning/lib/service"
 9 | 	"github.com/syou6162/go-active-learning/lib/util"
10 | 	"github.com/syou6162/go-active-learning/lib/util/converter"
11 | 	"github.com/urfave/cli"
12 | )
13 | 
// Feature is a feature name together with its learned weight.
type Feature struct {
	Key    string
	Weight float64
}

// FeatureList is a list of Feature values that sorts by ascending Weight.
type FeatureList []Feature

// Len reports the number of features (sort.Interface).
func (features FeatureList) Len() int {
	return len(features)
}

// Less reports whether feature i weighs less than feature j (sort.Interface).
func (features FeatureList) Less(i, j int) bool {
	return features[i].Weight < features[j].Weight
}

// Swap exchanges the features at positions i and j (sort.Interface).
func (features FeatureList) Swap(i, j int) {
	features[j], features[i] = features[i], features[j]
}
24 | 
25 | func DoListFeatureWeight(c *cli.Context) error {
26 | 	filterStatusCodeOk := c.Bool("filter-status-code-ok")
27 | 
28 | 	app, err := service.NewDefaultApp()
29 | 	if err != nil {
30 | 		return err
31 | 	}
32 | 	defer app.Close()
33 | 
34 | 	examples, err := app.SearchExamples()
35 | 	if err != nil {
36 | 		return err
37 | 	}
38 | 	app.Fetch(examples)
39 | 	for _, e := range examples {
40 | 		app.UpdateFeatureVector(e)
41 | 	}
42 | 	training := util.FilterLabeledExamples(examples)
43 | 
44 | 	if filterStatusCodeOk {
45 | 		training = util.FilterStatusCodeOkExamples(training)
46 | 	}
47 | 
48 | 	model, err := classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(training))
49 | 	if err != nil {
50 | 		return err
51 | 	}
52 | 
53 | 	tmp := make(FeatureList, 0)
54 | 	for _, k := range model.GetActiveFeatures() {
55 | 		tmp = append(tmp, Feature{k, model.GetWeight(k)})
56 | 	}
57 | 	sort.Sort(sort.Reverse(tmp))
58 | 
59 | 	for _, p := range tmp {
60 | 		fmt.Println(fmt.Sprintf("%+0.2f\t%s", p.Weight, p.Key))
61 | 	}
62 | 
63 | 	return nil
64 | }
65 | 


--------------------------------------------------------------------------------
/lib/diagnosis/feature_weight/feature_weight_test.go:
--------------------------------------------------------------------------------
 1 | package featureweight_test
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/syou6162/go-active-learning/lib/command"
 7 | 	"github.com/syou6162/go-active-learning/lib/service"
 8 | 	"github.com/syou6162/go-active-learning/lib/util/file"
 9 | 	"github.com/urfave/cli"
10 | )
11 | 
12 | func TestDoListFeatureWeight(t *testing.T) {
13 | 	inputFilename := "../../../tech_input_example.txt"
14 | 	train, err := file.ReadExamples(inputFilename)
15 | 	if err != nil {
16 | 		t.Error(err)
17 | 	}
18 | 
19 | 	a, err := service.NewDefaultApp()
20 | 	if err != nil {
21 | 		t.Error(err)
22 | 	}
23 | 	defer a.Close()
24 | 
25 | 	if err = a.DeleteAllExamples(); err != nil {
26 | 		t.Error(err)
27 | 	}
28 | 
29 | 	for _, example := range train {
30 | 		if err = a.UpdateOrCreateExample(example); err != nil {
31 | 			t.Error(err)
32 | 		}
33 | 	}
34 | 
35 | 	app := cli.NewApp()
36 | 	app.Commands = command.Commands
37 | 	args := []string{
38 | 		"go-active-learning",
39 | 		"diagnose",
40 | 		"feature-weight",
41 | 		"--filter-status-code-ok",
42 | 	}
43 | 
44 | 	if err := app.Run(args); err != nil {
45 | 		t.Error(err)
46 | 	}
47 | }
48 | 


--------------------------------------------------------------------------------
/lib/diagnosis/label_conflict/label_conflict.go:
--------------------------------------------------------------------------------
 1 | package labelconflict
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"os"
 6 | 	"sort"
 7 | 	"strconv"
 8 | 
 9 | 	"encoding/csv"
10 | 
11 | 	"github.com/syou6162/go-active-learning/lib/classifier"
12 | 	"github.com/syou6162/go-active-learning/lib/model"
13 | 	"github.com/syou6162/go-active-learning/lib/service"
14 | 	"github.com/syou6162/go-active-learning/lib/util"
15 | 	"github.com/syou6162/go-active-learning/lib/util/converter"
16 | 	"github.com/urfave/cli"
17 | )
18 | 
19 | // DoLabelConflict trains a MIRA classifier on the labeled examples, scores
20 | // each training example with it, and prints the examples whose label disagrees
21 | // with the predicted score followed by the ones that agree.
22 | //
23 | // With the "filter-status-code-ok" flag set, the training set is first narrowed
24 | // by util.FilterStatusCodeOkExamples. Any error from setup, training or
25 | // printing is returned.
26 | func DoLabelConflict(c *cli.Context) error {
27 | 	filterStatusCodeOk := c.Bool("filter-status-code-ok")
28 | 
29 | 	app, err := service.NewDefaultApp()
30 | 	if err != nil {
31 | 		return err
32 | 	}
33 | 	defer app.Close()
34 | 
35 | 	examples, err := app.SearchExamples()
36 | 	if err != nil {
37 | 		return err
38 | 	}
39 | 	app.Fetch(examples)
40 | 	for _, e := range examples {
41 | 		app.UpdateFeatureVector(e)
42 | 	}
43 | 	training := util.FilterLabeledExamples(examples)
44 | 
45 | 	if filterStatusCodeOk {
46 | 		training = util.FilterStatusCodeOkExamples(training)
47 | 	}
48 | 
49 | 	m, err := classifier.NewMIRAClassifierByCrossValidation(classifier.EXAMPLE, converter.ConvertExamplesToLearningInstances(training))
50 | 	if err != nil {
51 | 		return err
52 | 	}
53 | 
54 | 	wrongExamples := model.Examples{}
55 | 	correctExamples := model.Examples{}
56 | 
57 | 	for _, e := range training {
58 | 		e.Score = m.PredictScore(e.Fv)
59 | 		// Label and score have opposite signs: the model disagrees with the label.
60 | 		if float64(e.Label)*e.Score < 0 {
61 | 			wrongExamples = append(wrongExamples, e)
62 | 		} else {
63 | 			correctExamples = append(correctExamples, e)
64 | 		}
65 | 	}
66 | 
67 | 	sort.Sort(sort.Reverse(wrongExamples))
68 | 	sort.Sort(correctExamples)
69 | 
70 | 	// Propagate write failures instead of reporting success unconditionally.
71 | 	return printResult(*m, correctExamples, wrongExamples)
72 | }
65 | 
66 | // printResult writes a tab-separated table to stdout: a header line, then one
67 | // row per example (index, label, predicted score, URL, title), with the wrong
68 | // examples listed before the correct ones.
69 | func printResult(m classifier.MIRAClassifier, correctExamples model.Examples, wrongExamples model.Examples) error {
70 | 	fmt.Println("Index\tLabel\tScore\tURL\tTitle")
71 | 
72 | 	rows := append(wrongExamples, correctExamples...)
73 | 
74 | 	tsv := csv.NewWriter(os.Stdout)
75 | 	tsv.Comma = '\t'
76 | 
77 | 	for i := range rows {
78 | 		ex := rows[i]
79 | 		index := strconv.Itoa(i)
80 | 		label := strconv.Itoa(int(ex.Label))
81 | 		score := fmt.Sprintf("%0.03f", m.PredictScore(ex.Fv))
82 | 		if err := tsv.Write([]string{index, label, score, ex.Url, ex.Title}); err != nil {
83 | 			return err
84 | 		}
85 | 	}
86 | 
87 | 	// Flush returns nothing; a failed write surfaces through Error afterwards.
88 | 	tsv.Flush()
89 | 	return tsv.Error()
90 | }
93 | 


--------------------------------------------------------------------------------
/lib/diagnosis/label_conflict/label_conflict_test.go:
--------------------------------------------------------------------------------
 1 | package labelconflict_test
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/syou6162/go-active-learning/lib/command"
 7 | 	"github.com/syou6162/go-active-learning/lib/service"
 8 | 	"github.com/syou6162/go-active-learning/lib/util/file"
 9 | 	"github.com/urfave/cli"
10 | )
11 | 
12 | // TestDoLabelConflict loads the sample examples into the store and then runs
13 | // the `diagnose label-conflict` CLI command end to end.
14 | func TestDoLabelConflict(t *testing.T) {
15 | 	inputFilename := "../../../tech_input_example.txt"
16 | 	train, err := file.ReadExamples(inputFilename)
17 | 	if err != nil {
18 | 		t.Fatal(err)
19 | 	}
20 | 
21 | 	a, err := service.NewDefaultApp()
22 | 	if err != nil {
23 | 		// Fatal, not Error: continuing would call methods on an unusable app.
24 | 		t.Fatal(err)
25 | 	}
26 | 	defer a.Close()
27 | 
28 | 	if err = a.DeleteAllExamples(); err != nil {
29 | 		t.Fatal(err)
30 | 	}
31 | 
32 | 	for _, example := range train {
33 | 		if err = a.UpdateOrCreateExample(example); err != nil {
34 | 			t.Error(err)
35 | 		}
36 | 	}
37 | 
38 | 	app := cli.NewApp()
39 | 	app.Commands = command.Commands
40 | 	args := []string{
41 | 		"go-active-learning",
42 | 		"diagnose",
43 | 		"label-conflict",
44 | 	}
45 | 
46 | 	if err := app.Run(args); err != nil {
47 | 		t.Error(err)
48 | 	}
49 | }
47 | 


--------------------------------------------------------------------------------
/lib/evaluation/evaluation.go:
--------------------------------------------------------------------------------
 1 | package evaluation
 2 | 
 3 | import (
 4 | 	"github.com/syou6162/go-active-learning/lib/model"
 5 | )
 6 | 
 7 | // GetAccuracy returns the fraction of positions at which predict equals gold.
 8 | // It returns 0.0 when the slices differ in length or are empty, so the result
 9 | // is never NaN.
10 | func GetAccuracy(gold []model.LabelType, predict []model.LabelType) float64 {
11 | 	if len(gold) != len(predict) || len(gold) == 0 {
12 | 		return 0.0
13 | 	}
14 | 	sum := 0.0
15 | 	for i, v := range gold {
16 | 		if v == predict[i] {
17 | 			sum += 1.0
18 | 		}
19 | 	}
20 | 	return sum / float64(len(gold))
21 | }
19 | 
20 | // GetPrecision returns tp / (tp + fp), where a "positive" is a prediction of
21 | // model.POSITIVE. It returns 0.0 when the slices differ in length or when
22 | // nothing was predicted positive, instead of panicking or yielding NaN.
23 | func GetPrecision(gold []model.LabelType, predict []model.LabelType) float64 {
24 | 	if len(gold) != len(predict) {
25 | 		return 0.0
26 | 	}
27 | 	tp := 0.0
28 | 	fp := 0.0
29 | 	for i, v := range gold {
30 | 		if v == model.POSITIVE && predict[i] == model.POSITIVE {
31 | 			tp += 1.0
32 | 		}
33 | 		if v == model.NEGATIVE && predict[i] == model.POSITIVE {
34 | 			fp += 1.0
35 | 		}
36 | 	}
37 | 	if tp+fp == 0 {
38 | 		// No positive predictions at all: 0/0 would be NaN.
39 | 		return 0.0
40 | 	}
41 | 	return tp / (tp + fp)
42 | }
33 | 
34 | // GetRecall returns tp / (tp + fn) with respect to the model.POSITIVE class.
35 | // It returns 0.0 when the slices differ in length or when gold holds no
36 | // positive that was predicted POSITIVE or NEGATIVE, instead of panicking or
37 | // yielding NaN.
38 | func GetRecall(gold []model.LabelType, predict []model.LabelType) float64 {
39 | 	if len(gold) != len(predict) {
40 | 		return 0.0
41 | 	}
42 | 	tp := 0.0
43 | 	fn := 0.0
44 | 	for i, v := range gold {
45 | 		if v == model.POSITIVE && predict[i] == model.POSITIVE {
46 | 			tp += 1.0
47 | 		}
48 | 		if v == model.POSITIVE && predict[i] == model.NEGATIVE {
49 | 			fn += 1.0
50 | 		}
51 | 	}
52 | 	if tp+fn == 0 {
53 | 		// Nothing to recall: 0/0 would be NaN.
54 | 		return 0.0
55 | 	}
56 | 	return tp / (tp + fn)
57 | }
47 | 
48 | func GetConfusionMatrix(gold []model.LabelType, predict []model.LabelType) (int, int, int, int) {
49 | 	tp := 0
50 | 	fp := 0
51 | 	fn := 0
52 | 	tn := 0
53 | 	for i, v := range gold {
54 | 		if v == model.POSITIVE && predict[i] == model.POSITIVE {
55 | 			tp += 1
56 | 		}
57 | 		if v == model.NEGATIVE && predict[i] == model.POSITIVE {
58 | 			fp += 1
59 | 		}
60 | 		if v == model.POSITIVE && predict[i] == model.NEGATIVE {
61 | 			fn += 1
62 | 		}
63 | 		if v == model.NEGATIVE && predict[i] == model.NEGATIVE {
64 | 			tn += 1
65 | 		}
66 | 	}
67 | 	return tp, fp, fn, tn
68 | }
69 | 


--------------------------------------------------------------------------------
/lib/evaluation/evaluation_test.go:
--------------------------------------------------------------------------------
 1 | package evaluation
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"testing"
 6 | 
 7 | 	"github.com/syou6162/go-active-learning/lib/model"
 8 | )
 9 | 
10 | // TestGetAccuracy checks accuracy on four labels with one wrong prediction.
11 | func TestGetAccuracy(t *testing.T) {
12 | 	gold := []model.LabelType{model.POSITIVE, model.POSITIVE, model.NEGATIVE, model.NEGATIVE}
13 | 	predict := []model.LabelType{model.POSITIVE, model.POSITIVE, model.NEGATIVE, model.POSITIVE}
14 | 	accuracy := 0.75
15 | 
16 | 	if GetAccuracy(gold, predict) != accuracy {
17 | 		// Sprintf builds the message; Printf would print it to stdout and hand
18 | 		// t.Error a byte count instead.
19 | 		t.Error(fmt.Sprintf("Accuracy should be %f", accuracy))
20 | 	}
21 | }
19 | 
20 | // TestGetPrecision checks precision with one true and one false positive.
21 | func TestGetPrecision(t *testing.T) {
22 | 	gold := []model.LabelType{model.POSITIVE, model.POSITIVE, model.NEGATIVE, model.NEGATIVE}
23 | 	predict := []model.LabelType{model.POSITIVE, model.NEGATIVE, model.NEGATIVE, model.POSITIVE}
24 | 	precision := 0.5
25 | 
26 | 	if GetPrecision(gold, predict) != precision {
27 | 		// Sprintf builds the message; Printf would print it to stdout and hand
28 | 		// t.Error a byte count instead.
29 | 		t.Error(fmt.Sprintf("Precision should be %f", precision))
30 | 	}
31 | }
29 | 
30 | // TestGetRecall checks recall with one true positive and one false negative.
31 | func TestGetRecall(t *testing.T) {
32 | 	gold := []model.LabelType{model.POSITIVE, model.POSITIVE, model.NEGATIVE, model.NEGATIVE}
33 | 	predict := []model.LabelType{model.POSITIVE, model.NEGATIVE, model.NEGATIVE, model.POSITIVE}
34 | 	recall := 0.5
35 | 
36 | 	if GetRecall(gold, predict) != recall {
37 | 		// Sprintf builds the message; Printf would print it to stdout and hand
38 | 		// t.Error a byte count instead.
39 | 		t.Error(fmt.Sprintf("Recall should be %f", recall))
40 | 	}
41 | }
39 | 


--------------------------------------------------------------------------------
/lib/example/example.go:
--------------------------------------------------------------------------------
 1 | package example
 2 | 
 3 | import (
 4 | 	"time"
 5 | 
 6 | 	"github.com/syou6162/go-active-learning/lib/feature"
 7 | 	example_feature "github.com/syou6162/go-active-learning/lib/feature/example"
 8 | 	"github.com/syou6162/go-active-learning/lib/model"
 9 | )
10 | 
11 | // NewExample builds an Example for url carrying the given label. FinalUrl
12 | // starts out equal to url, both timestamps are set to the current time, and
13 | // IsNew is true only for unlabeled examples.
14 | func NewExample(url string, label model.LabelType) *model.Example {
15 | 	createdAt := time.Now()
16 | 	// Text fields, score, status code and error count keep their zero values.
17 | 	return &model.Example{
18 | 		Label:           label,
19 | 		Fv:              feature.FeatureVector{},
20 | 		Url:             url,
21 | 		FinalUrl:        url,
22 | 		IsNew:           label == model.UNLABELED,
23 | 		CreatedAt:       createdAt,
24 | 		UpdatedAt:       createdAt,
25 | 		ReferringTweets: &model.ReferringTweets{},
26 | 		HatenaBookmark:  &model.HatenaBookmark{Bookmarks: make([]*model.Bookmark, 0)},
27 | 	}
28 | }
39 | 
40 | // GetStat counts examples per label under the keys "positive", "negative" and
41 | // "unlabeled". A key is absent when no example carries that label.
42 | func GetStat(examples model.Examples) map[string]int {
43 | 	counts := map[string]int{}
44 | 	for i := range examples {
45 | 		label := examples[i].Label
46 | 		if label == model.POSITIVE {
47 | 			counts["positive"]++
48 | 		} else if label == model.NEGATIVE {
49 | 			counts["negative"]++
50 | 		} else if label == model.UNLABELED {
51 | 			counts["unlabeled"]++
52 | 		}
53 | 	}
54 | 	return counts
55 | }
54 | 
55 | func ExtractFeatures(e model.Example) feature.FeatureVector {
56 | 	var fv feature.FeatureVector
57 | 	fv = append(fv, "BIAS")
58 | 	fv = append(fv, example_feature.ExtractHostFeature(e.FinalUrl))
59 | 	fv = append(fv, example_feature.ExtractJpnNounFeatures(example_feature.ExtractPath(e.FinalUrl), "URL")...)
60 | 	fv = append(fv, example_feature.ExtractNounFeatures(e.Title, "TITLE")...)
61 | 	fv = append(fv, example_feature.ExtractNounFeatures(e.Description, "DESCRIPTION")...)
62 | 	fv = append(fv, example_feature.ExtractNounFeatures(e.Body, "BODY")...)
63 | 	return fv
64 | }
65 | 


--------------------------------------------------------------------------------
/lib/feature/example/example.go:
--------------------------------------------------------------------------------
  1 | package example_feature
  2 | 
  3 | import (
  4 | 	"net/url"
  5 | 	"strings"
  6 | 	"sync"
  7 | 	"unicode"
  8 | 
  9 | 	"github.com/ikawaha/kagome/tokenizer"
 10 | 	"github.com/jdkato/prose/tag"
 11 | 	"github.com/jdkato/prose/tokenize"
 12 | 	"github.com/syou6162/go-active-learning/lib/feature"
 13 | )
 14 | 
 15 | var excludingWordList = []string{ // tokens that IsExcludingWord reports as excluded
 16 | 	`:`, `;`,
 17 | 	`,`, `.`,
 18 | 	`"`, `''`,
 19 | 	`+`, `-`, `*`, `/`, `|`, `++`, `--`,
 20 | 	`[`, `]`,
 21 | 	`{`, `}`,
 22 | 	`(`, `)`,
 23 | 	`<`, `>`,
 24 | 	`「`, `」`,
 25 | 	`／`,
 26 | 	`@`, `#`, `~`, `%`, `$`, `^`,
 27 | }
 28 | 
 29 | var ( // lazily initialised singletons, each paired with the sync.Once guarding it
 30 | 	japaneseTokenizer     *tokenizer.Tokenizer // set by GetJapaneseTokenizer
 31 | 	japaneseTokenizerOnce sync.Once
 32 | 	englishTokenizer      *tokenize.TreebankWordTokenizer // set by GetEnglishTokenizer
 33 | 	englishTokenizerOnce  sync.Once
 34 | 	englishTagger         *tag.PerceptronTagger // set by GetEnglishTagger
 35 | 	englishTaggerOnce     sync.Once
 36 | 	excludingWordMapOnce  sync.Once // guards the one-time fill of excludingWordMap
 37 | )
 38 | 
 39 | var excludingWordMap = make(map[string]bool) // filled from excludingWordList on first IsExcludingWord call
 40 | 
 41 | // GetJapaneseTokenizer returns the shared kagome tokenizer, creating it on
 42 | // the first call.
 43 | func GetJapaneseTokenizer() *tokenizer.Tokenizer {
 44 | 	create := func() {
 45 | 		tok := tokenizer.New()
 46 | 		japaneseTokenizer = &tok
 47 | 	}
 48 | 	japaneseTokenizerOnce.Do(create)
 49 | 	return japaneseTokenizer
 50 | }
 49 | 
 50 | // GetEnglishTokenizer returns the shared Treebank word tokenizer, creating it
 51 | // on the first call.
 52 | func GetEnglishTokenizer() *tokenize.TreebankWordTokenizer {
 53 | 	create := func() {
 54 | 		englishTokenizer = tokenize.NewTreebankWordTokenizer()
 55 | 	}
 56 | 	englishTokenizerOnce.Do(create)
 57 | 	return englishTokenizer
 58 | }
 56 | 
 57 | // GetEnglishTagger returns the shared perceptron part-of-speech tagger,
 58 | // creating it on the first call.
 59 | func GetEnglishTagger() *tag.PerceptronTagger {
 60 | 	create := func() {
 61 | 		englishTagger = tag.NewPerceptronTagger()
 62 | 	}
 63 | 	englishTaggerOnce.Do(create)
 64 | 	return englishTagger
 65 | }
 63 | 
 64 | // isJapanese reports whether str contains at least one Hiragana, Katakana or
 65 | // Han character, or one of the punctuation marks "。" and "、".
 66 | func isJapanese(str string) bool {
 67 | 	for _, r := range str {
 68 | 		if unicode.In(r, unicode.Hiragana, unicode.Katakana, unicode.Han) {
 69 | 			return true
 70 | 		}
 71 | 	}
 72 | 	return strings.ContainsAny(str, "。、")
 73 | }
 77 | 
 78 | // IsExcludingWord reports whether w is present in excludingWordMap. The map is
 79 | // filled from excludingWordList on the first call.
 80 | func IsExcludingWord(w string) bool {
 81 | 	fill := func() {
 82 | 		for i := range excludingWordList {
 83 | 			excludingWordMap[excludingWordList[i]] = true
 84 | 		}
 85 | 	}
 86 | 	excludingWordMapOnce.Do(fill)
 87 | 
 88 | 	_, excluded := excludingWordMap[w]
 89 | 	return excluded
 90 | }
 89 | 
 90 | // extractEngNounFeaturesWithoutPrefix tokenizes and POS-tags s as English and
 91 | // returns the lower-cased tokens tagged as nouns or pronouns, skipping
 92 | // excluded words. It returns a nil vector for an empty string.
 93 | func extractEngNounFeaturesWithoutPrefix(s string) feature.FeatureVector {
 94 | 	var nouns feature.FeatureVector
 95 | 	if s == "" {
 96 | 		return nouns
 97 | 	}
 98 | 
 99 | 	tagged := GetEnglishTagger().Tag(GetEnglishTokenizer().Tokenize(s))
100 | 	for i := range tagged {
101 | 		text, pos := tagged[i].Text, tagged[i].Tag
102 | 		if IsExcludingWord(text) {
103 | 			continue
104 | 		}
105 | 		// Penn Treebank tags, see
106 | 		// https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
107 | 		if pos == "NN" || pos == "NNS" || pos == "NNP" || pos == "NNPS" || pos == "PRP" || pos == "PRP$" {
108 | 			nouns = append(nouns, strings.ToLower(text))
109 | 		}
110 | 	}
111 | 	return nouns
112 | }
111 | 
112 | // extractEngNounFeatures returns the English noun features of s, each written
113 | // as "<prefix>:<noun>". The result is nil when no noun is found.
114 | func extractEngNounFeatures(s string, prefix string) feature.FeatureVector {
115 | 	var prefixed feature.FeatureVector
116 | 	nouns := extractEngNounFeaturesWithoutPrefix(s)
117 | 	for i := range nouns {
118 | 		prefixed = append(prefixed, prefix+":"+nouns[i])
119 | 	}
120 | 	return prefixed
121 | }
119 | 
120 | // ExtractJpnNounFeaturesWithoutPrefix tokenizes the lower-cased s with the
121 | // Japanese tokenizer and returns the surface forms of its nouns. Nouns whose
122 | // second feature is "数" (number) are replaced by "NUM"; excluded words are
123 | // skipped. It returns a nil vector for an empty string.
124 | func ExtractJpnNounFeaturesWithoutPrefix(s string) feature.FeatureVector {
125 | 	var nouns feature.FeatureVector
126 | 	if s == "" {
127 | 		return nouns
128 | 	}
129 | 	for _, tok := range GetJapaneseTokenizer().Tokenize(strings.ToLower(s)) {
130 | 		// "名詞" is the tokenizer's part-of-speech name for nouns.
131 | 		if tok.Pos() != "名詞" {
132 | 			continue
133 | 		}
134 | 		word := tok.Surface
135 | 		if feats := tok.Features(); len(feats) >= 2 && feats[1] == "数" {
136 | 			word = "NUM"
137 | 		}
138 | 		if !IsExcludingWord(word) {
139 | 			nouns = append(nouns, word)
140 | 		}
141 | 	}
142 | 	return nouns
143 | }
141 | 
142 | // ExtractJpnNounFeatures returns the Japanese noun features of s, each written
143 | // as "<prefix>:<noun>". The result is nil when no noun is found.
144 | func ExtractJpnNounFeatures(s string, prefix string) feature.FeatureVector {
145 | 	var prefixed feature.FeatureVector
146 | 	nouns := ExtractJpnNounFeaturesWithoutPrefix(s)
147 | 	for i := range nouns {
148 | 		prefixed = append(prefixed, prefix+":"+nouns[i])
149 | 	}
150 | 	return prefixed
151 | }
149 | 
150 | // ExtractNounFeatures extracts prefixed noun features from s, using the
151 | // Japanese extractor when isJapanese(s) holds and the English one otherwise.
152 | func ExtractNounFeatures(s string, prefix string) feature.FeatureVector {
153 | 	if !isJapanese(s) {
154 | 		return extractEngNounFeatures(s, prefix)
155 | 	}
156 | 	return ExtractJpnNounFeatures(s, prefix)
157 | }
157 | 
158 | // ExtractNounFeaturesWithoutPrefix extracts bare noun features from s, using
159 | // the Japanese extractor when isJapanese(s) holds and the English one
160 | // otherwise.
161 | func ExtractNounFeaturesWithoutPrefix(s string) feature.FeatureVector {
162 | 	if !isJapanese(s) {
163 | 		return extractEngNounFeaturesWithoutPrefix(s)
164 | 	}
165 | 	return ExtractJpnNounFeaturesWithoutPrefix(s)
166 | }
165 | 
166 | func ExtractHostFeature(urlString string) string {
167 | 	prefix := "HOST"
168 | 	u, err := url.Parse(urlString)
169 | 	if err != nil {
170 | 		return prefix + ":INVALID_HOST"
171 | 	}
172 | 	return prefix + ":" + u.Host
173 | }
174 | 
175 | // ExtractPath returns the path component of urlString, or the empty string
176 | // when the URL cannot be parsed.
177 | func ExtractPath(urlString string) string {
178 | 	parsed, err := url.Parse(urlString)
179 | 	if err == nil {
180 | 		return parsed.Path
181 | 	}
182 | 	return ""
183 | }
183 | 


--------------------------------------------------------------------------------
/lib/feature/example/example_test.go:
--------------------------------------------------------------------------------
 1 | package example_feature
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"testing"
 6 | )
 7 | 
 8 | // TestIsJapanese checks Japanese detection on kana-only, mixed and English
 9 | // input.
10 | func TestIsJapanese(t *testing.T) {
11 | 	// Messages are built with Sprintf; Printf would print to stdout and hand
12 | 	// t.Error a byte count instead of the text.
13 | 	text := "ほげ"
14 | 	if !isJapanese(text) {
15 | 		t.Error(fmt.Sprintf("%s should be Japanese", text))
16 | 	}
17 | 	text = "文献紹介 / Youtube"
18 | 	if !isJapanese(text) {
19 | 		t.Error(fmt.Sprintf("%s should be Japanese", text))
20 | 	}
21 | 	text = "This is a pen."
22 | 	if isJapanese(text) {
23 | 		t.Error(fmt.Sprintf("%s should be not Japanese", text))
24 | 	}
25 | }
22 | 
23 | // TestJapaneseNounFeatures checks how many nouns are extracted from two
24 | // sample strings.
25 | func TestJapaneseNounFeatures(t *testing.T) {
26 | 	// Messages are built with Sprintf; Printf would print to stdout and hand
27 | 	// t.Error a byte count instead of the text.
28 | 	text := "日本語のテストです"
29 | 	fv := ExtractJpnNounFeaturesWithoutPrefix(text)
30 | 	if len(fv) != 2 {
31 | 		t.Error(fmt.Sprintf("Size of feature vector for %s should be 2, but %d", text, len(fv)))
32 | 	}
33 | 	text = "文献紹介 / Youtube"
34 | 	fv = ExtractJpnNounFeaturesWithoutPrefix(text)
35 | 	if len(fv) != 3 {
36 | 		t.Error(fmt.Sprintf("Size of feature vector for %s should be 3, but %d", text, len(fv)))
37 | 	}
38 | }
35 | 
36 | // TestEngNounFeatures checks that two nouns are extracted from "Hello World!".
37 | func TestEngNounFeatures(t *testing.T) {
38 | 	text := "Hello World!"
39 | 	fv := extractEngNounFeatures(text, "")
40 | 	if len(fv) != 2 {
41 | 		// Sprintf builds the message; Printf would print it to stdout and hand
42 | 		// t.Error a byte count instead.
43 | 		t.Error(fmt.Sprintf("Size of feature vector for %s should be 2", text))
44 | 	}
45 | }
43 | 
44 | // TestExtractPath checks that the query string is not part of the path.
45 | func TestExtractPath(t *testing.T) {
46 | 	url := "https://b.hatena.ne.jp/search/text?safe=on&q=nlp&users=50"
47 | 	path := "/search/text"
48 | 	if ExtractPath(url) != path {
49 | 		// Sprintf builds the message; Printf would print it to stdout and hand
50 | 		// t.Error a byte count instead.
51 | 		t.Error(fmt.Sprintf("path should be %s", path))
52 | 	}
53 | }
51 | 
52 | // TestExtractHostFeature checks the "HOST:<host>" feature of a sample URL.
53 | func TestExtractHostFeature(t *testing.T) {
54 | 	url := "https://b.hatena.ne.jp/search/text?safe=on&q=nlp&users=50"
55 | 	hostFeature := "HOST:b.hatena.ne.jp"
56 | 	if ExtractHostFeature(url) != hostFeature {
57 | 		// Sprintf builds the message; Printf would print it to stdout and hand
58 | 		// t.Error a byte count instead.
59 | 		t.Error(fmt.Sprintf("Host feature should be %s", hostFeature))
60 | 	}
61 | }
59 | 


--------------------------------------------------------------------------------
/lib/feature/feature.go:
--------------------------------------------------------------------------------
 1 | package feature
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | )
 6 | 
 7 | // FeatureVector is a list of feature names.
 8 | type FeatureVector []string
 9 | 
10 | // MarshalBinary encodes the vector as a JSON array of strings.
11 | func (fv *FeatureVector) MarshalBinary() ([]byte, error) {
12 | 	return json.Marshal(fv)
13 | }
14 | 
15 | // UnmarshalBinary replaces the vector with the JSON array decoded from data.
16 | func (fv *FeatureVector) UnmarshalBinary(data []byte) error {
17 | 	return json.Unmarshal(data, fv)
18 | }
24 | 


--------------------------------------------------------------------------------
/lib/feature/tweet/tweet.go:
--------------------------------------------------------------------------------
  1 | package tweet_feature
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"regexp"
  6 | 
  7 | 	"github.com/syou6162/go-active-learning/lib/feature"
  8 | 	"github.com/syou6162/go-active-learning/lib/model"
  9 | 	"gopkg.in/vmarkovtsev/go-lcss.v1"
 10 | )
 11 | 
 12 | type ExampleAndTweet struct { // an example paired with one tweet, plus values derived by GetExampleAndTweet
 13 | 	example       *model.Example
 14 | 	tweet         *model.Tweet
 15 | 	lcsLen        int    // GetLCSLen(example.Title, tweet.FullText)
 16 | 	atMarksCnt    int    // number of `@[^ ]+` matches in tweet.FullText
 17 | 	hashTagsCnt   int    // number of `#[^ ]+` matches in tweet.FullText
 18 | 	cleanedText   string // tweet.FullText with those @ and # matches removed
 19 | 	cleanedLcsLen int    // GetLCSLen(example.Title, cleanedText)
 20 | }
 21 | 
 22 | // GetLabel returns the label stored on the tweet.
 23 | func (et *ExampleAndTweet) GetLabel() model.LabelType {
 24 | 	tw := et.tweet
 25 | 	return tw.Label
 26 | }
 25 | 
 26 | func GetExampleAndTweet(e *model.Example, t *model.Tweet) ExampleAndTweet {
 27 | 	result := ExampleAndTweet{example: e, tweet: t}
 28 | 	result.lcsLen = GetLCSLen(e.Title, t.FullText)
 29 | 
 30 | 	atRegexp := regexp.MustCompile(`@[^ ]+`)
 31 | 	result.atMarksCnt = len(atRegexp.FindAllStringSubmatch(t.FullText, -1))
 32 | 	str := atRegexp.ReplaceAllString(t.FullText, "")
 33 | 	hashRegexp := regexp.MustCompile(`#[^ ]+`)
 34 | 	result.hashTagsCnt = len(hashRegexp.FindAllStringSubmatch(t.FullText, -1))
 35 | 	result.cleanedText = hashRegexp.ReplaceAllString(str, "")
 36 | 	result.cleanedLcsLen = GetLCSLen(e.Title, result.cleanedText)
 37 | 	return result
 38 | }
 39 | 
 40 | // GetLCSLen returns the length in bytes of the longest common substring of
 41 | // str1 and str2.
 42 | func GetLCSLen(str1 string, str2 string) int {
 43 | 	common := lcss.LongestCommonSubstring([]byte(str1), []byte(str2))
 44 | 	return len(common)
 45 | }
 43 | 
 44 | func LCSLenFeature(et ExampleAndTweet) string {
 45 | 	prefix := "LCSLenFeature"
 46 | 	len := et.lcsLen
 47 | 	switch {
 48 | 	case len == 0:
 49 | 		return fmt.Sprintf("%s:0", prefix)
 50 | 	case len < 5:
 51 | 		return fmt.Sprintf("%s:5", prefix)
 52 | 	case len < 10:
 53 | 		return fmt.Sprintf("%s:10", prefix)
 54 | 	case len < 25:
 55 | 		return fmt.Sprintf("%s:25", prefix)
 56 | 	case len < 50:
 57 | 		return fmt.Sprintf("%s:50", prefix)
 58 | 	case len < 100:
 59 | 		return fmt.Sprintf("%s:100", prefix)
 60 | 	default:
 61 | 		return fmt.Sprintf("%s:INF", prefix)
 62 | 	}
 63 | }
 64 | 
 65 | func CleanedLCSLenFeature(et ExampleAndTweet) string {
 66 | 	prefix := "CleanedLCSLenFeature"
 67 | 	len := et.cleanedLcsLen
 68 | 	switch {
 69 | 	case len == 0:
 70 | 		return fmt.Sprintf("%s:0", prefix)
 71 | 	case len < 5:
 72 | 		return fmt.Sprintf("%s:5", prefix)
 73 | 	case len < 10:
 74 | 		return fmt.Sprintf("%s:10", prefix)
 75 | 	case len < 25:
 76 | 		return fmt.Sprintf("%s:25", prefix)
 77 | 	case len < 50:
 78 | 		return fmt.Sprintf("%s:50", prefix)
 79 | 	case len < 100:
 80 | 		return fmt.Sprintf("%s:100", prefix)
 81 | 	default:
 82 | 		return fmt.Sprintf("%s:INF", prefix)
 83 | 	}
 84 | }
 85 | 
 86 | // LCSRatioFeature buckets lcsLen / len(tweet.FullText) into
 87 | // "LCSRatioFeature:<bucket>", where the bucket is 0.0, 0.1, 0.25, 0.5, 0.75,
 88 | // 0.9 or 1.0. An empty tweet text maps to the 0.0 bucket.
 89 | func LCSRatioFeature(et ExampleAndTweet) string {
 90 | 	prefix := "LCSRatioFeature"
 91 | 	textLen := len(et.tweet.FullText)
 92 | 	if textLen == 0 {
 93 | 		// Avoid 0/0 = NaN, which fails every comparison below and would land
 94 | 		// in the 1.0 bucket.
 95 | 		return fmt.Sprintf("%s:0.0", prefix)
 96 | 	}
 97 | 	ratio := float64(et.lcsLen) / float64(textLen)
 98 | 	switch {
 99 | 	case ratio == 0.0:
100 | 		return fmt.Sprintf("%s:0.0", prefix)
101 | 	case ratio < 0.1:
102 | 		return fmt.Sprintf("%s:0.1", prefix)
103 | 	case ratio < 0.25:
104 | 		return fmt.Sprintf("%s:0.25", prefix)
105 | 	case ratio < 0.5:
106 | 		return fmt.Sprintf("%s:0.5", prefix)
107 | 	case ratio < 0.75:
108 | 		return fmt.Sprintf("%s:0.75", prefix)
109 | 	case ratio < 0.9:
110 | 		// Was labelled 0.0, which merged this bucket with "no overlap".
111 | 		return fmt.Sprintf("%s:0.9", prefix)
112 | 	default:
113 | 		return fmt.Sprintf("%s:1.0", prefix)
114 | 	}
115 | }
106 | 
107 | // CleanedLCSRatioFeature buckets cleanedLcsLen / len(tweet.FullText) into
108 | // "CleanedLCSRatioFeature:<bucket>", where the bucket is 0.0, 0.1, 0.25, 0.5,
109 | // 0.75, 0.9 or 1.0. An empty tweet text maps to the 0.0 bucket.
110 | func CleanedLCSRatioFeature(et ExampleAndTweet) string {
111 | 	prefix := "CleanedLCSRatioFeature"
112 | 	// NOTE(review): the denominator is the full text length, not the cleaned
113 | 	// text length; kept as in the original, confirm this is intended.
114 | 	textLen := len(et.tweet.FullText)
115 | 	if textLen == 0 {
116 | 		// Avoid 0/0 = NaN, which fails every comparison below and would land
117 | 		// in the 1.0 bucket.
118 | 		return fmt.Sprintf("%s:0.0", prefix)
119 | 	}
120 | 	ratio := float64(et.cleanedLcsLen) / float64(textLen)
121 | 	switch {
122 | 	case ratio == 0.0:
123 | 		return fmt.Sprintf("%s:0.0", prefix)
124 | 	case ratio < 0.1:
125 | 		return fmt.Sprintf("%s:0.1", prefix)
126 | 	case ratio < 0.25:
127 | 		return fmt.Sprintf("%s:0.25", prefix)
128 | 	case ratio < 0.5:
129 | 		return fmt.Sprintf("%s:0.5", prefix)
130 | 	case ratio < 0.75:
131 | 		return fmt.Sprintf("%s:0.75", prefix)
132 | 	case ratio < 0.9:
133 | 		// Was labelled 0.0, which merged this bucket with "no overlap".
134 | 		return fmt.Sprintf("%s:0.9", prefix)
135 | 	default:
136 | 		return fmt.Sprintf("%s:1.0", prefix)
137 | 	}
138 | }
127 | 
128 | func FavoriteCountFeature(et ExampleAndTweet) string {
129 | 	prefix := "FavoriteCountFeature"
130 | 	cnt := et.tweet.FavoriteCount
131 | 	switch {
132 | 	case cnt == 0:
133 | 		return fmt.Sprintf("%s:0", prefix)
134 | 	case cnt == 1:
135 | 		return fmt.Sprintf("%s:1", prefix)
136 | 	case cnt <= 3:
137 | 		return fmt.Sprintf("%s:3", prefix)
138 | 	case cnt <= 5:
139 | 		return fmt.Sprintf("%s:5", prefix)
140 | 	case cnt <= 10:
141 | 		return fmt.Sprintf("%s:10", prefix)
142 | 	case cnt <= 25:
143 | 		return fmt.Sprintf("%s:25", prefix)
144 | 	case cnt <= 50:
145 | 		return fmt.Sprintf("%s:50", prefix)
146 | 	case cnt <= 100:
147 | 		return fmt.Sprintf("%s:100", prefix)
148 | 	default:
149 | 		return fmt.Sprintf("%s:INF", prefix)
150 | 	}
151 | }
152 | 
153 | func RetweetCountFeature(et ExampleAndTweet) string {
154 | 	prefix := "RetweetCountFeature"
155 | 	cnt := et.tweet.RetweetCount
156 | 	switch {
157 | 	case cnt == 0:
158 | 		return fmt.Sprintf("%s:0", prefix)
159 | 	case cnt == 1:
160 | 		return fmt.Sprintf("%s:1", prefix)
161 | 	case cnt <= 3:
162 | 		return fmt.Sprintf("%s:3", prefix)
163 | 	case cnt <= 5:
164 | 		return fmt.Sprintf("%s:5", prefix)
165 | 	case cnt <= 10:
166 | 		return fmt.Sprintf("%s:10", prefix)
167 | 	case cnt <= 25:
168 | 		return fmt.Sprintf("%s:25", prefix)
169 | 	case cnt <= 50:
170 | 		return fmt.Sprintf("%s:50", prefix)
171 | 	case cnt <= 100:
172 | 		return fmt.Sprintf("%s:100", prefix)
173 | 	default:
174 | 		return fmt.Sprintf("%s:INF", prefix)
175 | 	}
176 | }
177 | 
178 | func AtMarksCountFeature(et ExampleAndTweet) string {
179 | 	prefix := "AtMarksCountFeature"
180 | 	cnt := et.atMarksCnt
181 | 	switch {
182 | 	case cnt == 0:
183 | 		return fmt.Sprintf("%s:0", prefix)
184 | 	case cnt == 1:
185 | 		return fmt.Sprintf("%s:1", prefix)
186 | 	case cnt <= 3:
187 | 		return fmt.Sprintf("%s:3", prefix)
188 | 	case cnt <= 5:
189 | 		return fmt.Sprintf("%s:5", prefix)
190 | 	case cnt <= 10:
191 | 		return fmt.Sprintf("%s:10", prefix)
192 | 	default:
193 | 		return fmt.Sprintf("%s:INF", prefix)
194 | 	}
195 | }
196 | 
197 | // HashTagsCountFeature buckets et.hashTagsCnt into
198 | // "HashTagsCountFeature:<bucket>", where the bucket is 0, 1, 3, 5, 10 or INF.
199 | func HashTagsCountFeature(et ExampleAndTweet) string {
200 | 	prefix := "HashTagsCountFeature"
201 | 	// Read the hashtag count; the @mention count was used here by mistake.
202 | 	cnt := et.hashTagsCnt
203 | 	switch {
204 | 	case cnt == 0:
205 | 		return fmt.Sprintf("%s:0", prefix)
206 | 	case cnt == 1:
207 | 		return fmt.Sprintf("%s:1", prefix)
208 | 	case cnt <= 3:
209 | 		return fmt.Sprintf("%s:3", prefix)
210 | 	case cnt <= 5:
211 | 		return fmt.Sprintf("%s:5", prefix)
212 | 	case cnt <= 10:
213 | 		return fmt.Sprintf("%s:10", prefix)
214 | 	default:
215 | 		return fmt.Sprintf("%s:INF", prefix)
216 | 	}
217 | }
215 | 
216 | // TextLengthFeature buckets the byte length of tweet.FullText into
217 | // "TextLengthFeature:<bucket>", where the bucket is 0, 1, 3, 5, 10, 25, 50,
218 | // 100 or INF.
219 | func TextLengthFeature(et ExampleAndTweet) string {
220 | 	prefix := "TextLengthFeature"
221 | 	cnt := len(et.tweet.FullText)
222 | 	switch {
223 | 	case cnt == 0:
224 | 		return fmt.Sprintf("%s:0", prefix)
225 | 	case cnt == 1:
226 | 		return fmt.Sprintf("%s:1", prefix)
227 | 	case cnt <= 3:
228 | 		// Was `cnt == 3`, which sent length 2 to the "5" bucket while length 3
229 | 		// went to "3", making the buckets non-monotonic.
230 | 		return fmt.Sprintf("%s:3", prefix)
231 | 	case cnt < 5:
232 | 		return fmt.Sprintf("%s:5", prefix)
233 | 	case cnt < 10:
234 | 		return fmt.Sprintf("%s:10", prefix)
235 | 	case cnt < 25:
236 | 		return fmt.Sprintf("%s:25", prefix)
237 | 	case cnt < 50:
238 | 		return fmt.Sprintf("%s:50", prefix)
239 | 	case cnt < 100:
240 | 		return fmt.Sprintf("%s:100", prefix)
241 | 	default:
242 | 		return fmt.Sprintf("%s:INF", prefix)
243 | 	}
244 | }
240 | 
241 | // CleanedTextLengthFeature buckets the byte length of et.cleanedText into
242 | // "CleanedTextLengthFeature:<bucket>", where the bucket is 0, 1, 3, 5, 10, 25,
243 | // 50, 100 or INF.
244 | func CleanedTextLengthFeature(et ExampleAndTweet) string {
245 | 	prefix := "CleanedTextLengthFeature"
246 | 	cnt := len(et.cleanedText)
247 | 	switch {
248 | 	case cnt == 0:
249 | 		return fmt.Sprintf("%s:0", prefix)
250 | 	case cnt == 1:
251 | 		return fmt.Sprintf("%s:1", prefix)
252 | 	case cnt <= 3:
253 | 		// Was `cnt == 3`, which sent length 2 to the "5" bucket while length 3
254 | 		// went to "3", making the buckets non-monotonic.
255 | 		return fmt.Sprintf("%s:3", prefix)
256 | 	case cnt < 5:
257 | 		return fmt.Sprintf("%s:5", prefix)
258 | 	case cnt < 10:
259 | 		return fmt.Sprintf("%s:10", prefix)
260 | 	case cnt < 25:
261 | 		return fmt.Sprintf("%s:25", prefix)
262 | 	case cnt < 50:
263 | 		return fmt.Sprintf("%s:50", prefix)
264 | 	case cnt < 100:
265 | 		return fmt.Sprintf("%s:100", prefix)
266 | 	default:
267 | 		return fmt.Sprintf("%s:INF", prefix)
268 | 	}
269 | }
265 | 
266 | // ScreenNameFeature returns "ScreenNameFeature:<screen name of the tweet>".
267 | func ScreenNameFeature(et ExampleAndTweet) string {
268 | 	screenName := et.tweet.ScreenName
269 | 	return fmt.Sprintf("%s:%s", "ScreenNameFeature", screenName)
270 | }
270 | 
271 | func (et *ExampleAndTweet) GetFeatureVector() feature.FeatureVector {
272 | 	var fv feature.FeatureVector
273 | 
274 | 	fv = append(fv, "BIAS")
275 | 	fv = append(fv, LCSLenFeature(*et))
276 | 	fv = append(fv, CleanedLCSLenFeature(*et))
277 | 	fv = append(fv, LCSRatioFeature(*et))
278 | 	fv = append(fv, CleanedLCSRatioFeature(*et))
279 | 	fv = append(fv, TextLengthFeature(*et))
280 | 	fv = append(fv, CleanedTextLengthFeature(*et))
281 | 
282 | 	fv = append(fv, ScreenNameFeature(*et))
283 | 	fv = append(fv, FavoriteCountFeature(*et))
284 | 	fv = append(fv, RetweetCountFeature(*et))
285 | 	fv = append(fv, AtMarksCountFeature(*et))
286 | 	fv = append(fv, HashTagsCountFeature(*et))
287 | 	return fv
288 | }
289 | 


--------------------------------------------------------------------------------
/lib/feature/tweet/tweet_test.go:
--------------------------------------------------------------------------------
 1 | package tweet_feature
 2 | 
 3 | import (
 4 | 	"reflect"
 5 | 	"testing"
 6 | 
 7 | 	"github.com/syou6162/go-active-learning/lib/feature"
 8 | 	"github.com/syou6162/go-active-learning/lib/model"
 9 | )
10 | 
11 | // TestExtractHostFeature checks the complete feature vector produced for a
12 | // sample example/tweet pair.
13 | func TestExtractHostFeature(t *testing.T) {
14 | 	e := model.Example{}
15 | 	e.Title = "Hello world"
16 | 	tweet := model.Tweet{}
17 | 	tweet.ScreenName = "syou6162"
18 | 	tweet.FullText = "Hello world @syou6162 @syou6163 #hashtag1 #hashtag2"
19 | 	tweet.FavoriteCount = 7
20 | 	tweet.RetweetCount = 7
21 | 
22 | 	et := GetExampleAndTweet(&e, &tweet)
23 | 	fv := et.GetFeatureVector()
24 | 	expect := feature.FeatureVector{
25 | 		"BIAS",
26 | 		"LCSLenFeature:25",
27 | 		"CleanedLCSLenFeature:25",
28 | 		"LCSRatioFeature:0.25",
29 | 		"CleanedLCSRatioFeature:0.25",
30 | 		"TextLengthFeature:100",
31 | 		"CleanedTextLengthFeature:25",
32 | 		"ScreenNameFeature:syou6162",
33 | 		"FavoriteCountFeature:10",
34 | 		"RetweetCountFeature:10",
35 | 		"AtMarksCountFeature:3",
36 | 		"HashTagsCountFeature:3",
37 | 	}
38 | 	if !reflect.DeepEqual(expect, fv) {
39 | 		// Show both vectors so a failure points at the offending feature.
40 | 		t.Errorf("feature must be wrong:\n got: %v\nwant: %v", fv, expect)
41 | 	}
42 | }
40 | 


--------------------------------------------------------------------------------
/lib/fetcher/fetcher.go:
--------------------------------------------------------------------------------
  1 | package fetcher
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"fmt"
  6 | 	"io/ioutil"
  7 | 	"net/http"
  8 | 	"regexp"
  9 | 	"strings"
 10 | 	"time"
 11 | 
 12 | 	"net/url"
 13 | 	"unicode/utf8"
 14 | 
 15 | 	"github.com/PuerkitoBio/goquery"
 16 | 	goose "github.com/syou6162/GoOse"
 17 | )
 18 | 
 19 | type Article struct { // page data returned by GetArticle
 20 | 	Url           string // canonical link if the page declares one, else the final request URL; utm_* and gi params removed
 21 | 	Title         string
 22 | 	Description   string // meta description (for arXiv abstract pages: text of the .abstract element)
 23 | 	OgDescription string
 24 | 	OgType        string
 25 | 	OgImage       string
 26 | 	Body          string // cleaned article text as extracted by goose
 27 | 	StatusCode    int    // HTTP status code of the response
 28 | 	Favicon       string // favicon URL, kept only when it is absolute; otherwise empty
 29 | 	PublishDate   *time.Time
 30 | }
 31 | 
 32 | var articleFetcher = http.Client{ // shared HTTP client used by GetArticle
 33 | 	Transport: &http.Transport{
 34 | 		MaxIdleConns:        0, // zero means no limit in net/http
 35 | 		MaxIdleConnsPerHost: 100,
 36 | 	},
 37 | 	Timeout: time.Duration(5 * time.Second), // whole-request timeout of five seconds
 38 | }
 39 | 
 40 | func updateTitleIfArxiv(article *goose.Article, origUrl string, finalUrl string, html []byte) error {
 41 | 	arxivUrl := "https://arxiv.org/abs/"
 42 | 	if strings.Contains(origUrl, arxivUrl) || strings.Contains(finalUrl, arxivUrl) {
 43 | 		// arxivのhtml内にはtitleタグが複数存在するので、丁寧にタイトルを取得する...
 44 | 		re := regexp.MustCompile(`<title>(.*?)</title>`)
 45 | 		m := re.FindSubmatch(html)
 46 | 		if len(m) >= 2 {
 47 | 			article.Title = string(m[1])
 48 | 		}
 49 | 	}
 50 | 	return nil
 51 | }
 52 | 
 53 | func updateMetaDescriptionIfArxiv(article *goose.Article, origUrl string, finalUrl string, html []byte) error {
 54 | 	arxivUrl := "https://arxiv.org/abs/"
 55 | 	if strings.Contains(origUrl, arxivUrl) || strings.Contains(finalUrl, arxivUrl) {
 56 | 		// article.Docでもいけそうだが、gooseが中で書き換えていてダメ。Documentを作りなおす
 57 | 		doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(html)))
 58 | 		if err != nil {
 59 | 			return err
 60 | 		}
 61 | 		article.MetaDescription = doc.Find(".abstract").Text()
 62 | 	}
 63 | 	return nil
 64 | }
 65 | 
 66 | // removeUtmParams returns origUrl without the utm_source, utm_medium,
 67 | // utm_campaign, utm_term, utm_content and gi query parameters. The remaining
 68 | // query is re-encoded. On a parse failure it returns origUrl unchanged
 69 | // together with the error.
 70 | func removeUtmParams(origUrl string) (string, error) {
 71 | 	parsed, err := url.Parse(origUrl)
 72 | 	if err != nil {
 73 | 		return origUrl, err
 74 | 	}
 75 | 
 76 | 	query, err := url.ParseQuery(parsed.RawQuery)
 77 | 	if err != nil {
 78 | 		return origUrl, err
 79 | 	}
 80 | 
 81 | 	trackingKeys := []string{
 82 | 		"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
 83 | 		"gi",
 84 | 	}
 85 | 	for _, key := range trackingKeys {
 86 | 		query.Del(key)
 87 | 	}
 88 | 
 89 | 	parsed.RawQuery = query.Encode()
 90 | 	return parsed.String(), nil
 91 | }
 89 | 
 90 | // GetArticle downloads origUrl and extracts its metadata and cleaned body.
 91 | //
 92 | // It returns an error when the request fails, when the response status is one
 93 | // of 302, 401, 403, 404, 410, 502 or 503, when the body is not valid UTF-8, or
 94 | // when extraction or URL clean-up fails. The returned Article.Url is the
 95 | // page's canonical link when present, otherwise the final request URL, with
 96 | // tracking parameters removed.
 97 | func GetArticle(origUrl string) (*Article, error) {
 98 | 	g := goose.New()
 99 | 	resp, err := articleFetcher.Get(origUrl)
100 | 	if err != nil {
101 | 		return nil, err
102 | 	}
103 | 	// Close on every path, including the early status-code return below.
104 | 	defer resp.Body.Close()
105 | 
106 | 	if resp.StatusCode == http.StatusFound ||
107 | 		resp.StatusCode == http.StatusUnauthorized ||
108 | 		resp.StatusCode == http.StatusForbidden ||
109 | 		resp.StatusCode == http.StatusNotFound ||
110 | 		resp.StatusCode == http.StatusGone ||
111 | 		resp.StatusCode == http.StatusBadGateway ||
112 | 		resp.StatusCode == http.StatusServiceUnavailable {
113 | 		return nil, errors.New(fmt.Sprintf("%s: Cannot fetch %s", resp.Status, origUrl))
114 | 	}
115 | 
116 | 	html, err := ioutil.ReadAll(resp.Body)
117 | 	if err != nil {
118 | 		return nil, err
119 | 	}
120 | 
121 | 	if !utf8.Valid(html) {
122 | 		return nil, errors.New(fmt.Sprintf("Invalid utf8 document: %s", origUrl))
123 | 	}
124 | 
125 | 	article, err := g.ExtractFromRawHTML(resp.Request.URL.String(), string(html))
126 | 	if err != nil {
127 | 		return nil, err
128 | 	}
129 | 
130 | 	finalUrl := article.CanonicalLink
131 | 	if finalUrl == "" {
132 | 		finalUrl = resp.Request.URL.String()
133 | 	}
134 | 
135 | 	finalUrl, err = removeUtmParams(finalUrl)
136 | 	if err != nil {
137 | 		return nil, err
138 | 	}
139 | 
140 | 	// The arXiv fix-ups are best effort: a failure leaves the extracted title
141 | 	// and description as they are.
142 | 	updateTitleIfArxiv(article, origUrl, finalUrl, html)
143 | 	updateMetaDescriptionIfArxiv(article, origUrl, finalUrl, html)
144 | 
145 | 	// Keep the favicon only when it is an absolute URL.
146 | 	favicon := ""
147 | 	if u, err := url.Parse(article.MetaFavicon); err == nil {
148 | 		if u.IsAbs() {
149 | 			favicon = article.MetaFavicon
150 | 		}
151 | 	}
152 | 
153 | 	return &Article{
154 | 		Url:           finalUrl,
155 | 		Title:         article.Title,
156 | 		Description:   article.MetaDescription,
157 | 		OgDescription: article.MetaOgDescription,
158 | 		OgType:        article.MetaOgType,
159 | 		OgImage:       article.MetaOgImage,
160 | 		Body:          article.CleanedText,
161 | 		StatusCode:    resp.StatusCode,
162 | 		Favicon:       favicon,
163 | 		PublishDate:   article.PublishDate,
164 | 	}, nil
165 | }
154 | 


--------------------------------------------------------------------------------
/lib/fetcher/fetcher_test.go:
--------------------------------------------------------------------------------
  1 | package fetcher
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"testing"
  6 | )
  7 | 
  8 | func TestGetArticle(t *testing.T) {
  9 | 	a, err := GetArticle("https://www.yasuhisay.info/entry/20090516/1242480413")
 10 | 	if err != nil {
 11 | 		t.Error(err.Error())
 12 | 	}
 13 | 
 14 | 	if a.Title == "" {
 15 | 		t.Error("Title must not be empty")
 16 | 	}
 17 | 	if a.Description == "" {
 18 | 		t.Error("Description must not be empty")
 19 | 	}
 20 | 	if a.OgType != "article" {
 21 | 		t.Error("OgType must be article")
 22 | 	}
 23 | 	if a.StatusCode != 200 {
 24 | 		t.Error("StatusCode must be 200")
 25 | 	}
 26 | }
 27 | 
 28 | func TestGetArticleARXIV(t *testing.T) {
 29 | 	a, err := GetArticle("https://arxiv.org/abs/2012.07805")
 30 | 	if err != nil {
 31 | 		t.Error(err.Error())
 32 | 	}
 33 | 
 34 | 	if a.Title != "[2012.07805] Extracting Training Data from Large Language Models" {
 35 | 		t.Error("Title must not be empty")
 36 | 	}
 37 | 	if a.Description == "" {
 38 | 		t.Error("Description must not be empty")
 39 | 	}
 40 | 	if a.StatusCode != 200 {
 41 | 		t.Error("StatusCode must be 200")
 42 | 	}
 43 | }
 44 | 
 45 | func TestGetArticleNotFound(t *testing.T) {
 46 | 	_, err := GetArticle("https://www.yasuhisay.info/entry/NOT_FOUND")
 47 | 	if err == nil {
 48 | 		t.Error("Error should occur")
 49 | 	}
 50 | }
 51 | 
 52 | func TestGetArticleWithInvalidEncoding(t *testing.T) {
 53 | 	url := "http://www.atmarkit.co.jp/ait/articles/1702/20/news021.html"
 54 | 	_, err := GetArticle(url)
 55 | 	if err == nil {
 56 | 		t.Error(fmt.Sprintf("Error must occur for this url: %s", url))
 57 | 	}
 58 | }
 59 | 
 60 | func TestRemoveUtmParams(t *testing.T) {
 61 | 	before := "https://techplay.jp/event/698349?utm_source=event_698349"
 62 | 	after, err := removeUtmParams(before)
 63 | 	if err != nil {
 64 | 		t.Error(fmt.Sprintf("Error must occur for this url: %s", before))
 65 | 	}
 66 | 	expected := "https://techplay.jp/event/698349"
 67 | 	if expected != after {
 68 | 		t.Errorf("url should be %s, but %s", expected, after)
 69 | 	}
 70 | 	a, err := GetArticle(before)
 71 | 	if expected != a.Url {
 72 | 		t.Errorf("url should be %s, but %s", expected, a.Url)
 73 | 	}
 74 | }
 75 | 
 76 | func TestFavicon(t *testing.T) {
 77 | 	url := "https://www.yasuhisay.info/entry/2020/11/22/190000"
 78 | 	a, err := GetArticle(url)
 79 | 	if err != nil {
 80 | 		t.Error(fmt.Sprintf("Error must not occur for this url: %s", url))
 81 | 	}
 82 | 	expectedFaviconPath := "https://www.yasuhisay.info/icon/favicon"
 83 | 	if expectedFaviconPath != a.Favicon {
 84 | 		t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath)
 85 | 	}
 86 | 
 87 | 	url = "https://www.lifehacker.jp/2018/11/amazon-impact-absorption-case.html"
 88 | 	a, err = GetArticle(url)
 89 | 	if err != nil {
 90 | 		t.Error(fmt.Sprintf("Error must not occur for this url: %s", url))
 91 | 	}
 92 | 	expectedFaviconPath = "https://www.lifehacker.jp/assets/common/img/favicon.ico"
 93 | 	if expectedFaviconPath != a.Favicon {
 94 | 		t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath)
 95 | 	}
 96 | 
 97 | 	url = "https://peterroelants.github.io/"
 98 | 	a, err = GetArticle(url)
 99 | 	if err != nil {
100 | 		t.Error(fmt.Sprintf("Error must not occur for this url: %s", url))
101 | 	}
102 | 	expectedFaviconPath = "https://peterroelants.github.io/images/favicon/apple-icon-57x57.png"
103 | 	if expectedFaviconPath != a.Favicon {
104 | 		t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath)
105 | 	}
106 | 
107 | 	url = "https://www.getrevue.co/profile/icoxfog417/issues/weekly-machine-learning-79-121292"
108 | 	a, err = GetArticle(url)
109 | 	if err != nil {
110 | 		t.Error(fmt.Sprintf("Error must not occur for this url: %s", url))
111 | 	}
112 | 	expectedFaviconPath = "https://d3jbm9h03wxzi9.cloudfront.net/assets/favicon-84fc7f228d52c2410eb7aa839e279caeaa491588c7c75229ed33e1c7f69fe75d.ico"
113 | 	if expectedFaviconPath != a.Favicon {
114 | 		t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath)
115 | 	}
116 | 
117 | 	url = "https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html"
118 | 	a, err = GetArticle(url)
119 | 	if err != nil {
120 | 		t.Error(fmt.Sprintf("Error must not occur for this url: %s", url))
121 | 	}
122 | 	expectedFaviconPath = "https://ai.googleblog.com/favicon.ico"
123 | 	if expectedFaviconPath != a.Favicon {
124 | 		t.Errorf("Favicon: %s should be %s", a.Favicon, expectedFaviconPath)
125 | 	}
126 | }
127 | 
128 | func TestGetPublishDate(t *testing.T) {
129 | 	a, err := GetArticle("https://www.yasuhisay.info/entry/2019/11/18/153000")
130 | 	if err != nil {
131 | 		t.Error("Error should not occur")
132 | 	}
133 | 	if a.PublishDate == nil {
134 | 		t.Error("PublishDate must not be nil")
135 | 	}
136 | }
137 | 


--------------------------------------------------------------------------------
/lib/hatena_bookmark/hatena_bookmark.go:
--------------------------------------------------------------------------------
 1 | package hatena_bookmark
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"fmt"
 6 | 	"io/ioutil"
 7 | 	"net/http"
 8 | 
 9 | 	"github.com/syou6162/go-active-learning/lib/model"
10 | )
11 | 
12 | func GetHatenaBookmark(url string) (*model.HatenaBookmark, error) {
13 | 	// ref: http://developer.hatena.ne.jp/ja/documents/bookmark/apis/getinfo
14 | 	res, err := http.Get(fmt.Sprintf("https://b.hatena.ne.jp/entry/jsonlite/?url=%s", url))
15 | 	if err != nil {
16 | 		return nil, err
17 | 	}
18 | 	if res.StatusCode != http.StatusOK {
19 | 		return nil, fmt.Errorf("error: %d", res.StatusCode)
20 | 	}
21 | 
22 | 	defer res.Body.Close()
23 | 	body, error := ioutil.ReadAll(res.Body)
24 | 	if error != nil {
25 | 		return nil, err
26 | 	}
27 | 
28 | 	bookmarks := model.HatenaBookmark{}
29 | 	err = json.Unmarshal(body, &bookmarks)
30 | 	if error != nil {
31 | 		return nil, err
32 | 	}
33 | 	return &bookmarks, nil
34 | }
35 | 


--------------------------------------------------------------------------------
/lib/hatena_bookmark/hatena_bookmark_test.go:
--------------------------------------------------------------------------------
 1 | package hatena_bookmark
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
 7 | func TestGetHatenaBookmark(t *testing.T) {
 8 | 	bookmarks, err := GetHatenaBookmark("https://www.yasuhisay.info")
 9 | 	if err != nil {
10 | 		t.Error(err.Error())
11 | 	}
12 | 
13 | 	if bookmarks.Title == "" {
14 | 		t.Error("Title must not be empty")
15 | 	}
16 | 	if bookmarks.Count == 0 {
17 | 		t.Error("Count must not be 0")
18 | 	}
19 | 	if len(bookmarks.Bookmarks) == 0 {
20 | 		t.Error("Count must not be 0")
21 | 	}
22 | }
23 | 


--------------------------------------------------------------------------------
/lib/model/error.go:
--------------------------------------------------------------------------------
 1 | package model
 2 | 
 3 | type notFoundError string
 4 | 
 5 | func (err notFoundError) Error() string {
 6 | 	return string(err) + " not found"
 7 | }
 8 | 
 9 | func NotFoundError(typ string) error {
10 | 	return notFoundError(typ)
11 | }
12 | 
13 | func IsNotFound(err error) bool {
14 | 	_, ok := err.(notFoundError)
15 | 	return ok
16 | }
17 | 


--------------------------------------------------------------------------------
/lib/model/example.go:
--------------------------------------------------------------------------------
 1 | package model
 2 | 
 3 | import (
 4 | 	"math"
 5 | 	"strings"
 6 | 	"time"
 7 | 
 8 | 	"github.com/syou6162/go-active-learning/lib/feature"
 9 | )
10 | 
11 | type Example struct {
12 | 	Id              int       `db:"id"`
13 | 	Label           LabelType `json:"Label" db:"label"`
14 | 	Fv              feature.FeatureVector
15 | 	Url             string           `json:"Url" db:"url"`
16 | 	FinalUrl        string           `json:"FinalUrl" db:"final_url"`
17 | 	Title           string           `json:"Title" db:"title"`
18 | 	Description     string           `json:"Description" db:"description"`
19 | 	OgDescription   string           `json:"OgDescription" db:"og_description"`
20 | 	OgType          string           `json:"OgType" db:"og_type"`
21 | 	OgImage         string           `json:"OgImage" db:"og_image"`
22 | 	Body            string           `json:"Body" db:"body"`
23 | 	Score           float64          `db:"score"`
24 | 	IsNew           bool             `db:"is_new"`
25 | 	StatusCode      int              `json:"StatusCode" db:"status_code"`
26 | 	Favicon         string           `json:"Favicon" db:"favicon"`
27 | 	ErrorCount      int              `json:"ErrorCount" db:"error_count"`
28 | 	CreatedAt       time.Time        `json:"CreatedAt" db:"created_at"`
29 | 	UpdatedAt       time.Time        `json:"UpdatedAt" db:"updated_at"`
30 | 	ReferringTweets *ReferringTweets `json:"ReferringTweets"`
31 | 	HatenaBookmark  *HatenaBookmark  `json:"HatenaBookmark"`
32 | }
33 | 
34 | type Examples []*Example
35 | 
36 | func (example *Example) GetLabel() LabelType {
37 | 	return example.Label
38 | }
39 | 
40 | func (example *Example) GetFeatureVector() feature.FeatureVector {
41 | 	return example.Fv
42 | }
43 | 
44 | func (example *Example) Annotate(label LabelType) {
45 | 	example.Label = label
46 | }
47 | 
48 | func (example *Example) IsLabeled() bool {
49 | 	return example.Label != UNLABELED
50 | }
51 | 
52 | func (example *Example) IsTwitterUrl() bool {
53 | 	twitterUrl := "https://twitter.com"
54 | 	return strings.Contains(example.Url, twitterUrl) || strings.Contains(example.FinalUrl, twitterUrl)
55 | }
56 | 
57 | func (example *Example) IsArticle() bool {
58 | 	// twitterはarticleと返ってくるが除外
59 | 	return example.OgType == "article" && !example.IsTwitterUrl()
60 | }
61 | 
62 | func (slice Examples) Len() int {
63 | 	return len(slice)
64 | }
65 | 
66 | func (slice Examples) Less(i, j int) bool {
67 | 	return math.Abs(slice[i].Score) < math.Abs(slice[j].Score)
68 | }
69 | 
70 | func (slice Examples) Swap(i, j int) {
71 | 	slice[i], slice[j] = slice[j], slice[i]
72 | }
73 | 


--------------------------------------------------------------------------------
/lib/model/hatena_bookmark.go:
--------------------------------------------------------------------------------
 1 | package model
 2 | 
 3 | import (
 4 | 	"database/sql/driver"
 5 | 	"encoding/json"
 6 | 	"strings"
 7 | 	"time"
 8 | )
 9 | 
// Tags is the list of tags attached to a bookmark. Its Scan/Value methods
// store it in the database as one tab-separated string.
type Tags []string

// HatenaBookmarkTime wraps a timestamp that is exchanged as JSON in the
// "2006/01/02 15:04" layout. A nil embedded Time stands for "no timestamp".
type HatenaBookmarkTime struct {
	*time.Time
}

// hatenaBookmarkTimeLayout is the timestamp layout used in the JSON form.
const hatenaBookmarkTimeLayout = "2006/01/02 15:04"

// UnmarshalJSON parses a quoted "2006/01/02 15:04" timestamp.
//
// JSON null is accepted as a no-op, following the encoding/json convention
// for Unmarshalers. On a parse error the receiver is left untouched and the
// error is returned.
// ref: https://dev.classmethod.jp/go/struct-json/
func (hbt *HatenaBookmarkTime) UnmarshalJSON(data []byte) error {
	if string(data) == "null" {
		return nil
	}
	t, err := time.Parse(`"`+hatenaBookmarkTimeLayout+`"`, string(data))
	if err != nil {
		// Do not store the zero time produced by a failed parse.
		return err
	}
	hbt.Time = &t
	return nil
}

// MarshalJSON renders the timestamp as a quoted "2006/01/02 15:04" string,
// or as JSON null when no time is set (previously a nil-pointer panic).
func (hbt HatenaBookmarkTime) MarshalJSON() ([]byte, error) {
	if hbt.Time == nil {
		return []byte("null"), nil
	}
	return json.Marshal(hbt.Format(hatenaBookmarkTimeLayout))
}

// Scan implements sql.Scanner. A NULL column clears the time; any other
// value must be a time.Time, and the type assertion panics otherwise (as it
// always did).
// ref: https://qiita.com/roothybrid7/items/52623bedb45ff0c26a8a
func (hbt *HatenaBookmarkTime) Scan(value interface{}) error {
	if value == nil {
		hbt.Time = nil
		return nil
	}
	v := value.(time.Time)
	hbt.Time = &v
	return nil
}

// Value implements driver.Valuer. An unset time is written as NULL instead
// of dereferencing a nil pointer.
func (hbt HatenaBookmarkTime) Value() (driver.Value, error) {
	if hbt.Time == nil {
		return nil, nil
	}
	return *hbt.Time, nil
}

// Scan implements sql.Scanner by splitting a tab-separated string.
//
// The destination is replaced, not appended to, so scanning into a reused
// value no longer accumulates tags from earlier rows. NULL and the empty
// string both yield an empty list; []byte is accepted alongside string. Any
// other type still panics on the string assertion, as before.
func (tags *Tags) Scan(value interface{}) error {
	var s string
	switch v := value.(type) {
	case nil:
		// NULL column: handled as an empty tag list below.
	case []byte:
		s = string(v)
	default:
		s = value.(string)
	}
	if s == "" {
		*tags = Tags{}
		return nil
	}
	*tags = Tags(strings.Split(s, "\t"))
	return nil
}

// Value implements driver.Valuer by joining the tags with tabs.
func (tags Tags) Value() (driver.Value, error) {
	return strings.Join(tags, "\t"), nil
}
52 | 
53 | type Bookmark struct {
54 | 	HatenaBookmarkId int                `db:"hatena_bookmark_id"`
55 | 	Timestamp        HatenaBookmarkTime `json:"timestamp" db:"timestamp"`
56 | 	User             string             `json:"user" db:"user"`
57 | 	Tags             Tags               `json:"tags" db:"tags"`
58 | 	Comment          string             `json:"comment" db:"comment"`
59 | }
60 | 
61 | type HatenaBookmark struct {
62 | 	Id         int         `db:"id"`
63 | 	ExampleId  int         `db:"example_id"`
64 | 	Title      string      `json:"title" db:"title"`
65 | 	Bookmarks  []*Bookmark `json:"bookmarks"`
66 | 	Screenshot string      `json:"screenshot" db:"screenshot"`
67 | 	EntryUrl   string      `json:"entry_url" db:"entry_url"`
68 | 	Count      int         `json:"count" db:"count"`
69 | 	Url        string      `json:"url" db:"url"`
70 | 	EId        string      `json:"eid" db:"eid"`
71 | }
72 | 
73 | func (bookmarks *HatenaBookmark) MarshalBinary() ([]byte, error) {
74 | 	json, err := json.Marshal(bookmarks)
75 | 	if err != nil {
76 | 		return nil, err
77 | 	}
78 | 	return []byte(json), nil
79 | }
80 | 
81 | func (bookmarks *HatenaBookmark) UnmarshalBinary(data []byte) error {
82 | 	err := json.Unmarshal(data, bookmarks)
83 | 	if err != nil {
84 | 		return err
85 | 	}
86 | 	return nil
87 | }
88 | 


--------------------------------------------------------------------------------
/lib/model/label_type.go:
--------------------------------------------------------------------------------
 1 | package model
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | )
 6 | 
 7 | type LabelType int
 8 | 
 9 | func (lt *LabelType) MarshalBinary() ([]byte, error) {
10 | 	return json.Marshal(lt)
11 | }
12 | 
13 | func (lt *LabelType) UnmarshalBinary(data []byte) error {
14 | 	if err := json.Unmarshal(data, &lt); err != nil {
15 | 		return err
16 | 	}
17 | 	return nil
18 | }
19 | 
20 | const (
21 | 	POSITIVE  LabelType = 1
22 | 	NEGATIVE  LabelType = -1
23 | 	UNLABELED LabelType = 0
24 | )
25 | 


--------------------------------------------------------------------------------
/lib/model/recommendation.go:
--------------------------------------------------------------------------------
 1 | package model
 2 | 
 3 | import "fmt"
 4 | 
 5 | type RecommendationListType int
 6 | 
 7 | const (
 8 | 	GENERAL RecommendationListType = 0
 9 | 	ARTICLE RecommendationListType = 1
10 | 	GITHUB  RecommendationListType = 2
11 | 	SLIDE   RecommendationListType = 3
12 | 	ARXIV   RecommendationListType = 4
13 | 	VIDEO   RecommendationListType = 5
14 | 	EVENT   RecommendationListType = 6
15 | )
16 | 
17 | func GetRecommendationListType(listname string) (RecommendationListType, error) {
18 | 	switch listname {
19 | 	case "general":
20 | 		return GENERAL, nil
21 | 	case "article":
22 | 		return ARTICLE, nil
23 | 	case "github":
24 | 		return GITHUB, nil
25 | 	case "slide":
26 | 		return SLIDE, nil
27 | 	case "arxiv":
28 | 		return ARXIV, nil
29 | 	case "video":
30 | 		return VIDEO, nil
31 | 	case "event":
32 | 		return EVENT, nil
33 | 	default:
34 | 		return -1, fmt.Errorf("no such RecommendationListType for '%s'", listname)
35 | 	}
36 | }
37 | 
38 | type Recommendation struct {
39 | 	RecommendationListType RecommendationListType
40 | 	ExampleIds             []int
41 | }
42 | 


--------------------------------------------------------------------------------
/lib/model/related_example.go:
--------------------------------------------------------------------------------
1 | package model
2 | 
// RelatedExamples pairs one example id with the ids of the examples that
// are related to it.
type RelatedExamples struct {
	ExampleId         int
	RelatedExampleIds []int
}
7 | 


--------------------------------------------------------------------------------
/lib/model/tweet.go:
--------------------------------------------------------------------------------
 1 | package model
 2 | 
 3 | import (
 4 | 	"time"
 5 | )
 6 | 
// Tweet is one stored tweet. Every field maps to the column named in its
// `db` tag; Id and ExampleId carry no json tag.
type Tweet struct {
	Id        int `db:"id"`
	ExampleId int `db:"example_id"` // presumably the example this tweet refers to — confirm against the schema

	// Tweet content and engagement counters.
	CreatedAt     time.Time `json:"CreatedAt" db:"created_at"`
	IdStr         string    `json:"IdStr" db:"id_str"`
	FullText      string    `json:"FullText" db:"full_text"`
	FavoriteCount int       `json:"FavoriteCount" db:"favorite_count"`
	RetweetCount  int       `json:"RetweetCount" db:"retweet_count"`
	Lang          string    `json:"Lang" db:"lang"`

	// Author details, plus the label and score stored with the tweet.
	ScreenName      string    `json:"ScreenName" db:"screen_name"`
	Name            string    `json:"Name" db:"name"`
	ProfileImageUrl string    `json:"ProfileImageUrl" db:"profile_image_url"`
	Label           LabelType `json:"Label" db:"label"`
	Score           float64   `json:"Score" db:"score"`
}

// ReferringTweets groups a list of tweets with a count.
// NOTE(review): nothing here shows whether Count always equals len(Tweets) —
// confirm with the code that fills it.
type ReferringTweets struct {
	Count  int      `json:"Count"`
	Tweets []*Tweet `json:"Tweets"`
}
29 | 


--------------------------------------------------------------------------------
/lib/related_example/related_example.go:
--------------------------------------------------------------------------------
  1 | package related_example
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"fmt"
  6 | 	"strconv"
  7 | 	"strings"
  8 | 
  9 | 	"os"
 10 | 
 11 | 	"github.com/syou6162/go-active-learning/lib/model"
 12 | 	"github.com/syou6162/go-active-learning/lib/service"
 13 | 	"github.com/urfave/cli"
 14 | )
 15 | 
 16 | func parseLine(line string) (int, int, error) {
 17 | 	tokens := strings.Split(line, "\t")
 18 | 	if len(tokens) == 2 {
 19 | 		exampleId, _ := strconv.ParseInt(tokens[0], 10, 0)
 20 | 		relatedExampleId, _ := strconv.ParseInt(tokens[1], 10, 0)
 21 | 		return int(exampleId), int(relatedExampleId), nil
 22 | 	}
 23 | 	return 0, 0, fmt.Errorf("Invalid line: %s", line)
 24 | }
 25 | 
 26 | func readRelatedExamples(filename string) ([]*model.RelatedExamples, error) {
 27 | 	fp, err := os.Open(filename)
 28 | 	defer fp.Close()
 29 | 	if err != nil {
 30 | 		return nil, err
 31 | 	}
 32 | 
 33 | 	exampleId2RelatedExampleIds := make(map[int][]int)
 34 | 	scanner := bufio.NewScanner(fp)
 35 | 	for scanner.Scan() {
 36 | 		line := scanner.Text()
 37 | 		exampleId, relatedExampleId, err := parseLine(line)
 38 | 		if err != nil {
 39 | 			return nil, err
 40 | 		}
 41 | 		if _, ok := exampleId2RelatedExampleIds[exampleId]; ok {
 42 | 			exampleId2RelatedExampleIds[exampleId] = append(exampleId2RelatedExampleIds[exampleId], relatedExampleId)
 43 | 		} else {
 44 | 			exampleId2RelatedExampleIds[exampleId] = []int{relatedExampleId}
 45 | 		}
 46 | 	}
 47 | 	if err := scanner.Err(); err != nil {
 48 | 		return nil, err
 49 | 	}
 50 | 	result := make([]*model.RelatedExamples, 0)
 51 | 	for exampleId, relatedExampleIds := range exampleId2RelatedExampleIds {
 52 | 		result = append(result, &model.RelatedExamples{ExampleId: exampleId, RelatedExampleIds: relatedExampleIds})
 53 | 	}
 54 | 	return result, nil
 55 | }
 56 | 
 57 | func doAddRelatedExamples(c *cli.Context) error {
 58 | 	inputFilename := c.String("input-filename")
 59 | 
 60 | 	if inputFilename == "" {
 61 | 		_ = cli.ShowCommandHelp(c, "add-related-examples")
 62 | 		return cli.NewExitError("`input-filename` is a required field.", 1)
 63 | 	}
 64 | 
 65 | 	app, err := service.NewDefaultApp()
 66 | 	if err != nil {
 67 | 		return err
 68 | 	}
 69 | 	defer app.Close()
 70 | 
 71 | 	relatedExamplesList, err := readRelatedExamples(inputFilename)
 72 | 	if err != nil {
 73 | 		return err
 74 | 	}
 75 | 	for _, relatedExamples := range relatedExamplesList {
 76 | 		for _, related := range relatedExamples.RelatedExampleIds {
 77 | 			fmt.Print(relatedExamples.ExampleId)
 78 | 			fmt.Print("\t")
 79 | 			fmt.Println(related)
 80 | 		}
 81 | 		err := app.UpdateRelatedExamples(*relatedExamples)
 82 | 		if err != nil {
 83 | 			return err
 84 | 		}
 85 | 	}
 86 | 	return nil
 87 | }
 88 | 
// CommandAddRelatedExamples is the CLI definition of the
// "add-related-examples" subcommand. It takes an --input-filename string
// flag and runs doAddRelatedExamples.
var CommandAddRelatedExamples = cli.Command{
	Name:  "add-related-examples",
	Usage: "add related examples",
	Description: `
Add related examples.
`,
	Action: doAddRelatedExamples,
	Flags: []cli.Flag{
		cli.StringFlag{Name: "input-filename"},
	},
}
100 | 


--------------------------------------------------------------------------------
/lib/repository/example.go:
--------------------------------------------------------------------------------
  1 | package repository
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"database/sql"
  6 | 	"fmt"
  7 | 	"io"
  8 | 	"time"
  9 | 
 10 | 	"github.com/lib/pq"
 11 | 	"github.com/syou6162/go-active-learning/lib/feature"
 12 | 	"github.com/syou6162/go-active-learning/lib/model"
 13 | 	"github.com/syou6162/go-active-learning/lib/util/file"
 14 | )
 15 | 
 16 | var exampleNotFoundError = model.NotFoundError("example")
 17 | 
 18 | // データが存在しなければ追加
 19 | // データが存在する場合は、以下の場合にのみ更新する
 20 | // - ラベルが正例か負例に変更された
 21 | // - クロール対象のサイトが一時的に200以外のステータスで前回データが取得できなかった
 22 | func (r *repository) UpdateOrCreateExample(e *model.Example) error {
 23 | 	now := time.Now()
 24 | 	e.UpdatedAt = now
 25 | 	_, err := r.db.NamedExec(`
 26 | INSERT INTO example
 27 | ( url,  final_url,  title,  description,  og_description,  og_type,  og_image,  body,  score,  is_new,  status_code,  favicon,  label,  created_at,  updated_at)
 28 | VALUES
 29 | (:url, :final_url, :title, :description, :og_description, :og_type, :og_image, :body, :score, :is_new, :status_code, :favicon, :label, :created_at, :updated_at)
 30 | ON CONFLICT (url)
 31 | DO UPDATE SET
 32 | url = :url, final_url = :final_url, title = :title,
 33 | description = :description, og_description = :og_description, og_type = :og_type, og_image = :og_image,
 34 | body = :body, score = :score, is_new = :is_new, status_code = :status_code, favicon = :favicon,
 35 | label = :label, created_at = :created_at, updated_at = :updated_at
 36 | WHERE
 37 | ((EXCLUDED.label != 0) AND (example.label != EXCLUDED.label)) OR
 38 | ((example.status_code != 200) AND (EXCLUDED.status_code = 200))
 39 | ;`, e)
 40 | 	if err != nil {
 41 | 		return err
 42 | 	}
 43 | 	tmp, err := r.FindExampleByUlr(e.Url)
 44 | 	if err != nil {
 45 | 		return err
 46 | 	}
 47 | 	e.Id = tmp.Id
 48 | 	return nil
 49 | }
 50 | 
 51 | func (r *repository) UpdateScore(e *model.Example) error {
 52 | 	if _, err := r.FindExampleByUlr(e.Url); err != nil {
 53 | 		return err
 54 | 	}
 55 | 	if _, err := r.db.Exec(`UPDATE example SET score = $1, updated_at = $2 WHERE url = $3;`, e.Score, time.Now(), e.Url); err != nil {
 56 | 		return err
 57 | 	}
 58 | 	return nil
 59 | }
 60 | 
 61 | func (r *repository) IncErrorCount(e *model.Example) error {
 62 | 	errorCount, err := r.GetErrorCount(e)
 63 | 	if err != nil {
 64 | 		return err
 65 | 	}
 66 | 	if _, err := r.db.Exec(`UPDATE example SET error_count = $1, updated_at = $2 WHERE url = $3;`, errorCount+1, time.Now(), e.Url); err != nil {
 67 | 		return err
 68 | 	}
 69 | 	return nil
 70 | }
 71 | 
 72 | func (r *repository) GetErrorCount(e *model.Example) (int, error) {
 73 | 	example, err := r.FindExampleByUlr(e.Url)
 74 | 	if err != nil {
 75 | 		if err == exampleNotFoundError {
 76 | 			return 0, nil
 77 | 		}
 78 | 		return 0, err
 79 | 	}
 80 | 	return example.ErrorCount, nil
 81 | }
 82 | 
 83 | func (r *repository) UpdateFeatureVector(e *model.Example) error {
 84 | 	tmp, err := r.FindExampleByUlr(e.Url)
 85 | 	if err != nil {
 86 | 		return err
 87 | 	}
 88 | 	id := tmp.Id
 89 | 	if _, err = r.db.Exec(`DELETE FROM feature WHERE example_id = $1;`, id); err != nil {
 90 | 		return err
 91 | 	}
 92 | 	_, err = r.db.Exec(`INSERT INTO feature (example_id, feature) VALUES ($1, unnest(cast($2 AS TEXT[])));`, id, pq.Array(e.Fv))
 93 | 	return err
 94 | }
 95 | 
 96 | func (r *repository) InsertExampleFromScanner(scanner *bufio.Scanner) (*model.Example, error) {
 97 | 	line := scanner.Text()
 98 | 	e, err := file.ParseLine(line)
 99 | 	if err != nil {
100 | 		return nil, err
101 | 	}
102 | 	err = r.UpdateOrCreateExample(e)
103 | 	if err != nil {
104 | 		return nil, err
105 | 	}
106 | 	return e, nil
107 | }
108 | 
109 | func (r *repository) InsertExamplesFromReader(reader io.Reader) error {
110 | 	scanner := bufio.NewScanner(reader)
111 | 
112 | 	for scanner.Scan() {
113 | 		_, err := r.InsertExampleFromScanner(scanner)
114 | 		if err != nil {
115 | 			return err
116 | 		}
117 | 	}
118 | 	if err := scanner.Err(); err != nil {
119 | 		return err
120 | 	}
121 | 	return nil
122 | }
123 | 
124 | func (r *repository) searchExamples(query string, args ...interface{}) (model.Examples, error) {
125 | 	examples := model.Examples{}
126 | 	err := r.db.Select(&examples, query, args...)
127 | 	if err != nil {
128 | 		return nil, err
129 | 	}
130 | 	return examples, nil
131 | }
132 | 
133 | func (r *repository) findExample(query string, args ...interface{}) (*model.Example, error) {
134 | 	e := model.Example{}
135 | 
136 | 	err := r.db.Get(&e, query, args...)
137 | 	if err != nil {
138 | 		if err == sql.ErrNoRows {
139 | 			return nil, exampleNotFoundError
140 | 		}
141 | 		return nil, err
142 | 	}
143 | 	return &e, nil
144 | }
145 | 
146 | func (r *repository) SearchExamples() (model.Examples, error) {
147 | 	query := `SELECT * FROM example;`
148 | 	return r.searchExamples(query)
149 | }
150 | 
151 | func (r *repository) SearchRecentExamples(from time.Time, limit int) (model.Examples, error) {
152 | 	query := `SELECT * FROM example WHERE created_at > $1 ORDER BY updated_at DESC LIMIT $2;`
153 | 	return r.searchExamples(query, from, limit)
154 | }
155 | 
156 | func (r *repository) SearchRecentExamplesByHost(host string, from time.Time, limit int) (model.Examples, error) {
157 | 	query := `SELECT * FROM example WHERE final_url like $1 || '%' AND created_at > $2 ORDER BY updated_at DESC LIMIT $3;`
158 | 	return r.searchExamples(query, host, from, limit)
159 | }
160 | 
161 | func (r *repository) SearchExamplesByLabel(label model.LabelType, limit int) (model.Examples, error) {
162 | 	query := `SELECT * FROM example WHERE label = $1 ORDER BY updated_at DESC LIMIT $2;`
163 | 	return r.searchExamples(query, label, limit)
164 | }
165 | 
166 | func (r *repository) SearchLabeledExamples(limit int) (model.Examples, error) {
167 | 	query := `SELECT * FROM example WHERE label != 0 ORDER BY updated_at DESC LIMIT $1;`
168 | 	return r.searchExamples(query, limit)
169 | }
170 | 
171 | func (r *repository) SearchPositiveExamples(limit int) (model.Examples, error) {
172 | 	return r.SearchExamplesByLabel(model.POSITIVE, limit)
173 | }
174 | 
175 | func (r *repository) SearchNegativeExamples(limit int) (model.Examples, error) {
176 | 	return r.SearchExamplesByLabel(model.NEGATIVE, limit)
177 | }
178 | 
179 | func (r *repository) SearchUnlabeledExamples(limit int) (model.Examples, error) {
180 | 	return r.SearchExamplesByLabel(model.UNLABELED, limit)
181 | }
182 | 
183 | func (r *repository) SearchPositiveScoredExamples(limit int) (model.Examples, error) {
184 | 	query := `SELECT * FROM example WHERE score > 0 ORDER BY updated_at DESC LIMIT $1;`
185 | 	return r.searchExamples(query, limit)
186 | }
187 | 
188 | func (r *repository) FindExampleByUlr(url string) (*model.Example, error) {
189 | 	query := `SELECT * FROM example WHERE url = $1;`
190 | 	return r.findExample(query, url)
191 | }
192 | 
// buildSelectQuery returns the "SELECT <columns>" prefix (without FROM) used
// for example queries.
//
// Fields such as body can be extremely long, so when useTruncatedField is
// true the text columns are shortened on the database side with LEFT():
// title to 200 characters, and description, og_description and body to 1000.
func buildSelectQuery(useTruncatedField bool) string {
	// column yields either the bare column name or its truncated form.
	column := func(name string, maxLen int) string {
		if !useTruncatedField {
			return name
		}
		return fmt.Sprintf("LEFT(%s, %d) AS %s", name, maxLen, name)
	}

	return "SELECT id, label, url, final_url, " +
		column("title", 200) + ", " +
		column("description", 1000) + ", " +
		column("og_description", 1000) + ", " +
		"og_type, og_image, " +
		column("body", 1000) + ", " +
		"score, is_new, status_code, favicon, error_count, created_at, updated_at"
}
208 | 
209 | func (r *repository) FindExampleById(id int) (*model.Example, error) {
210 | 	query := fmt.Sprintf(`%s FROM example WHERE id = $1;`, buildSelectQuery(true))
211 | 	return r.findExample(query, id)
212 | }
213 | 
214 | func (r *repository) SearchExamplesByUlrs(urls []string) (model.Examples, error) {
215 | 	// ref: https://godoc.org/github.com/lib/pq#Array
216 | 	query := `SELECT * FROM example WHERE url = ANY($1);`
217 | 	return r.searchExamples(query, pq.Array(urls))
218 | }
219 | 
220 | func (r *repository) SearchExamplesByIds(ids []int) (model.Examples, error) {
221 | 	if len(ids) == 0 {
222 | 		return model.Examples{}, nil
223 | 	}
224 | 	query := fmt.Sprintf(`%s FROM example WHERE id = ANY($1);`, buildSelectQuery(true))
225 | 	return r.searchExamples(query, pq.Array(ids))
226 | }
227 | 
228 | func (r *repository) SearchExamplesByKeywords(keywords []string, aggregator string, limit int) (model.Examples, error) {
229 | 	if len(keywords) == 0 {
230 | 		return model.Examples{}, nil
231 | 	}
232 | 	regexList := make([]string, 0)
233 | 	for _, w := range keywords {
234 | 		regexList = append(regexList, fmt.Sprintf(`.*%s.*`, w))
235 | 	}
236 | 	query := fmt.Sprintf(`%s FROM example WHERE title ~* %s($1) AND label != -1 ORDER BY (label, score) DESC LIMIT $2;`, buildSelectQuery(true), aggregator)
237 | 	return r.searchExamples(query, pq.Array(regexList), limit)
238 | }
239 | 
240 | func (r *repository) countExamplesByLabel(label model.LabelType) (int, error) {
241 | 	cnt := 0
242 | 	err := r.db.Get(&cnt, `SELECT COUNT(*) FROM example WHERE label = $1`, label)
243 | 	if err != nil {
244 | 		return 0, err
245 | 	}
246 | 	return cnt, nil
247 | }
248 | 
249 | func (r *repository) CountPositiveExamples() (int, error) {
250 | 	return r.countExamplesByLabel(model.POSITIVE)
251 | }
252 | 
253 | func (r *repository) CountNegativeExamples() (int, error) {
254 | 	return r.countExamplesByLabel(model.NEGATIVE)
255 | }
256 | 
257 | func (r *repository) CountUnlabeledExamples() (int, error) {
258 | 	return r.countExamplesByLabel(model.UNLABELED)
259 | }
260 | 
261 | func (r *repository) FindFeatureVector(e *model.Example) (feature.FeatureVector, error) {
262 | 	fv := feature.FeatureVector{}
263 | 	tmp, err := r.FindExampleByUlr(e.Url)
264 | 	if err != nil {
265 | 		return fv, err
266 | 	}
267 | 	id := tmp.Id
268 | 	query := `SELECT feature FROM feature WHERE example_id = $1;`
269 | 	err = r.db.Select(&fv, query, id)
270 | 	if err != nil {
271 | 		return fv, err
272 | 	}
273 | 	return fv, nil
274 | }
275 | 
276 | func (r *repository) SearchFeatureVector(examples model.Examples) (map[int]feature.FeatureVector, error) {
277 | 	type Pair struct {
278 | 		ExampleId int    `db:"example_id"`
279 | 		Feature   string `db:"feature"`
280 | 	}
281 | 
282 | 	fvById := make(map[int]feature.FeatureVector)
283 | 	urls := make([]string, 0)
284 | 	for _, e := range examples {
285 | 		urls = append(urls, e.Url)
286 | 	}
287 | 
288 | 	tmp, err := r.SearchExamplesByUlrs(urls)
289 | 	if err != nil {
290 | 		return fvById, err
291 | 	}
292 | 	ids := make([]int, 0)
293 | 	for _, e := range tmp {
294 | 		ids = append(ids, e.Id)
295 | 	}
296 | 
297 | 	query := `SELECT example_id, feature FROM feature WHERE example_id = ANY($1);`
298 | 	pairs := make([]Pair, 0)
299 | 	err = r.db.Select(&pairs, query, pq.Array(ids))
300 | 	if err != nil {
301 | 		return fvById, err
302 | 	}
303 | 
304 | 	for _, pair := range pairs {
305 | 		fvById[pair.ExampleId] = append(fvById[pair.ExampleId], pair.Feature)
306 | 	}
307 | 	return fvById, nil
308 | }
309 | 
310 | func (r *repository) DeleteAllExamples() error {
311 | 	_, err := r.db.Exec(`DELETE FROM example;`)
312 | 	return err
313 | }
314 | 


--------------------------------------------------------------------------------
/lib/repository/example_test.go:
--------------------------------------------------------------------------------
  1 | package repository_test
  2 | 
  3 | import (
  4 | 	"log"
  5 | 	"os"
  6 | 	"testing"
  7 | 	"time"
  8 | 
  9 | 	"github.com/syou6162/go-active-learning/lib/example"
 10 | 	"github.com/syou6162/go-active-learning/lib/feature"
 11 | 	"github.com/syou6162/go-active-learning/lib/model"
 12 | 	"github.com/syou6162/go-active-learning/lib/repository"
 13 | )
 14 | 
 15 | func TestMain(m *testing.M) {
 16 | 	repo, err := repository.New()
 17 | 	if err != nil {
 18 | 		log.Fatal(err.Error())
 19 | 	}
 20 | 	defer repo.Close()
 21 | 
 22 | 	ret := m.Run()
 23 | 	os.Exit(ret)
 24 | }
 25 | 
 26 | func TestPing(t *testing.T) {
 27 | 	repo, err := repository.New()
 28 | 	if err != nil {
 29 | 		t.Errorf(err.Error())
 30 | 	}
 31 | 	defer repo.Close()
 32 | 
 33 | 	if err := repo.Ping(); err != nil {
 34 | 		t.Errorf(err.Error())
 35 | 	}
 36 | }
 37 | 
 38 | func TestInsertExamplesFromReader(t *testing.T) {
 39 | 	repo, err := repository.New()
 40 | 	if err != nil {
 41 | 		t.Errorf(err.Error())
 42 | 	}
 43 | 	defer repo.Close()
 44 | 
 45 | 	if err = repo.DeleteAllExamples(); err != nil {
 46 | 		t.Error(err)
 47 | 	}
 48 | 
 49 | 	fp, err := os.Open("../../tech_input_example.txt")
 50 | 	defer fp.Close()
 51 | 	if err != nil {
 52 | 		t.Error(err)
 53 | 	}
 54 | 	repo.InsertExamplesFromReader(fp)
 55 | 
 56 | 	examples, err := repo.SearchExamples()
 57 | 	if err != nil {
 58 | 		t.Error(err)
 59 | 	}
 60 | 	if len(examples) == 0 {
 61 | 		t.Errorf("len(examples) > 0, but %d", len(examples))
 62 | 	}
 63 | }
 64 | 
 65 | func TestInsertOrUpdateExample(t *testing.T) {
 66 | 	repo, err := repository.New()
 67 | 	if err != nil {
 68 | 		t.Errorf(err.Error())
 69 | 	}
 70 | 	defer repo.Close()
 71 | 
 72 | 	if err = repo.DeleteAllExamples(); err != nil {
 73 | 		t.Error(err)
 74 | 	}
 75 | 
 76 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge.com", model.UNLABELED))
 77 | 	if err != nil {
 78 | 		t.Error(err)
 79 | 	}
 80 | 
 81 | 	examples, err := repo.SearchExamples()
 82 | 	if err != nil {
 83 | 		t.Error(err)
 84 | 	}
 85 | 	if len(examples) != 1 {
 86 | 		t.Errorf("len(examples) == %d, want 1", len(examples))
 87 | 	}
 88 | 	if examples[0].Label != model.UNLABELED {
 89 | 		t.Errorf("label == %d, want 0", examples[0].Label)
 90 | 	}
 91 | 	if examples[0].Id == 0 {
 92 | 		t.Error("id must not be 0")
 93 | 	}
 94 | 
 95 | 	// same url
 96 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge.com", model.NEGATIVE))
 97 | 	if err != nil {
 98 | 		t.Error(err)
 99 | 	}
100 | 
101 | 	examples, err = repo.SearchExamples()
102 | 	if err != nil {
103 | 		t.Error(err)
104 | 	}
105 | 	if len(examples) != 1 {
106 | 		t.Errorf("len(examples) == %d, want 1", len(examples))
107 | 	}
108 | 	if examples[0].Label != model.NEGATIVE {
109 | 		t.Errorf("label == %d, want -1", examples[0].Label)
110 | 	}
111 | 
112 | 	// same url but different label
113 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge.com", model.POSITIVE))
114 | 	if err != nil {
115 | 		t.Error(err)
116 | 	}
117 | 
118 | 	examples, err = repo.SearchExamples()
119 | 	if err != nil {
120 | 		t.Error(err)
121 | 	}
122 | 	if len(examples) != 1 {
123 | 		t.Errorf("len(examples) == %d, want 1", len(examples))
124 | 	}
125 | 	if examples[0].Label != model.POSITIVE {
126 | 		t.Errorf("label == %d, want 1", examples[0].Label)
127 | 	}
128 | 
129 | 	// cannot update to unlabeled
130 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge.com", model.UNLABELED))
131 | 	if err != nil {
132 | 		t.Error(err)
133 | 	}
134 | 
135 | 	examples, err = repo.SearchExamples()
136 | 	if err != nil {
137 | 		t.Error(err)
138 | 	}
139 | 	if len(examples) != 1 {
140 | 		t.Errorf("len(examples) == %d, want 1", len(examples))
141 | 	}
142 | 	if examples[0].Label != model.POSITIVE {
143 | 		t.Errorf("label == %d, want 1", examples[0].Label)
144 | 	}
145 | 
146 | 	// different url
147 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://another.com", model.NEGATIVE))
148 | 	if err != nil {
149 | 		t.Error(err)
150 | 	}
151 | 
152 | 	examples, err = repo.SearchExamples()
153 | 	if err != nil {
154 | 		t.Error(err)
155 | 	}
156 | 	if len(examples) != 2 {
157 | 		t.Errorf("len(examples) == %d, want 2", len(examples))
158 | 	}
159 | }
160 | 
161 | func TestUpdateScore(t *testing.T) {
162 | 	repo, err := repository.New()
163 | 	if err != nil {
164 | 		t.Errorf(err.Error())
165 | 	}
166 | 	defer repo.Close()
167 | 
168 | 	if err = repo.DeleteAllExamples(); err != nil {
169 | 		t.Error(err)
170 | 	}
171 | 
172 | 	url := "http://hoge.com"
173 | 	e := example.NewExample(url, model.UNLABELED)
174 | 	e.Score = 1.0
175 | 	err = repo.UpdateOrCreateExample(e)
176 | 	if err != nil {
177 | 		t.Error(err)
178 | 	}
179 | 
180 | 	e, err = repo.FindExampleByUlr(url)
181 | 	if err != nil {
182 | 		t.Error(err)
183 | 	}
184 | 	if e.Score != 1.0 {
185 | 		t.Errorf("e.Score == %f, want 1.0", e.Score)
186 | 	}
187 | 
188 | 	e.Score = 100.0
189 | 	err = repo.UpdateScore(e)
190 | 	if err != nil {
191 | 		t.Error(err)
192 | 	}
193 | 
194 | 	e, err = repo.FindExampleByUlr(url)
195 | 	if err != nil {
196 | 		t.Error(err)
197 | 	}
198 | 	if e.Score != 100.0 {
199 | 		t.Errorf("e.Score == %f, want 100.0", e.Score)
200 | 	}
201 | }
202 | 
203 | func TestErrorCount(t *testing.T) {
204 | 	repo, err := repository.New()
205 | 	if err != nil {
206 | 		t.Errorf(err.Error())
207 | 	}
208 | 	defer repo.Close()
209 | 
210 | 	if err = repo.DeleteAllExamples(); err != nil {
211 | 		t.Error(err)
212 | 	}
213 | 
214 | 	existingUrl := example.NewExample("https://github.com", model.POSITIVE)
215 | 	nonExistingUrl := example.NewExample("http://hoge.fuga", model.NEGATIVE)
216 | 	examples := model.Examples{existingUrl, nonExistingUrl}
217 | 
218 | 	for _, e := range examples {
219 | 		if err := repo.UpdateOrCreateExample(e); err != nil {
220 | 			t.Error(err)
221 | 		}
222 | 
223 | 		cnt, err := repo.GetErrorCount(e)
224 | 		if err != nil {
225 | 			t.Errorf("Cannot get error count: %s", err.Error())
226 | 		}
227 | 		if cnt != 0 {
228 | 			t.Errorf("Error count must be 0 for %s", e.Url)
229 | 		}
230 | 	}
231 | 
232 | 	for _, e := range examples {
233 | 		err := repo.IncErrorCount(e)
234 | 		if err != nil {
235 | 			t.Errorf("Cannot get error count: %s", err.Error())
236 | 		}
237 | 	}
238 | 
239 | 	for _, e := range examples {
240 | 		cnt, err := repo.GetErrorCount(e)
241 | 		if err != nil {
242 | 			t.Errorf("Cannot get error count: %s", err.Error())
243 | 		}
244 | 		if cnt != 1 {
245 | 			t.Errorf("Error count must be 1 for %s", e.Url)
246 | 		}
247 | 	}
248 | }
249 | 
250 | func TestReadLabeledExamples(t *testing.T) {
251 | 	repo, err := repository.New()
252 | 	if err != nil {
253 | 		t.Errorf(err.Error())
254 | 	}
255 | 	defer repo.Close()
256 | 
257 | 	if err = repo.DeleteAllExamples(); err != nil {
258 | 		t.Error(err)
259 | 	}
260 | 
261 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE))
262 | 	if err != nil {
263 | 		t.Error(err)
264 | 	}
265 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
266 | 	if err != nil {
267 | 		t.Error(err)
268 | 	}
269 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
270 | 	if err != nil {
271 | 		t.Error(err)
272 | 	}
273 | 
274 | 	examples, err := repo.SearchLabeledExamples(10)
275 | 	if err != nil {
276 | 		t.Error(err)
277 | 	}
278 | 	if len(examples) != 2 {
279 | 		t.Errorf("len(examples) == %d, want 2", len(examples))
280 | 	}
281 | }
282 | 
283 | func TestReadRecentExamples(t *testing.T) {
284 | 	repo, err := repository.New()
285 | 	if err != nil {
286 | 		t.Errorf(err.Error())
287 | 	}
288 | 	defer repo.Close()
289 | 
290 | 	if err = repo.DeleteAllExamples(); err != nil {
291 | 		t.Error(err)
292 | 	}
293 | 
294 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE))
295 | 	if err != nil {
296 | 		t.Error(err)
297 | 	}
298 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
299 | 	if err != nil {
300 | 		t.Error(err)
301 | 	}
302 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
303 | 	if err != nil {
304 | 		t.Error(err)
305 | 	}
306 | 
307 | 	examples, err := repo.SearchRecentExamples(time.Now().Add(time.Duration(-10)*time.Minute), 10)
308 | 	if err != nil {
309 | 		t.Error(err)
310 | 	}
311 | 	if len(examples) != 3 {
312 | 		t.Errorf("len(examples) == %d, want 3", len(examples))
313 | 	}
314 | }
315 | 
316 | func TestReadRecentExamplesByHost(t *testing.T) {
317 | 	repo, err := repository.New()
318 | 	if err != nil {
319 | 		t.Errorf(err.Error())
320 | 	}
321 | 	defer repo.Close()
322 | 
323 | 	if err = repo.DeleteAllExamples(); err != nil {
324 | 		t.Error(err)
325 | 	}
326 | 
327 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE))
328 | 	if err != nil {
329 | 		t.Error(err)
330 | 	}
331 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
332 | 	if err != nil {
333 | 		t.Error(err)
334 | 	}
335 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
336 | 	if err != nil {
337 | 		t.Error(err)
338 | 	}
339 | 
340 | 	examples, err := repo.SearchRecentExamplesByHost("http://hoge1.com", time.Now().Add(time.Duration(-10)*time.Minute), 10)
341 | 	if err != nil {
342 | 		t.Error(err)
343 | 	}
344 | 	if len(examples) != 1 {
345 | 		t.Errorf("len(examples) == %d, want 1", len(examples))
346 | 	}
347 | }
348 | 
349 | func TestSearchExamplesByUlr(t *testing.T) {
350 | 	repo, err := repository.New()
351 | 	if err != nil {
352 | 		t.Errorf(err.Error())
353 | 	}
354 | 	defer repo.Close()
355 | 
356 | 	if err = repo.DeleteAllExamples(); err != nil {
357 | 		t.Error(err)
358 | 	}
359 | 
360 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.NEGATIVE))
361 | 	if err != nil {
362 | 		t.Error(err)
363 | 	}
364 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
365 | 	if err != nil {
366 | 		t.Error(err)
367 | 	}
368 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
369 | 	if err != nil {
370 | 		t.Error(err)
371 | 	}
372 | 
373 | 	example, err := repo.FindExampleByUlr("http://hoge1.com")
374 | 	if err != nil {
375 | 		t.Error(err)
376 | 	}
377 | 	if example.Url == "" {
378 | 		t.Errorf("example.Url == %s, want http://hoge1.com", example.Url)
379 | 	}
380 | 
381 | 	example, err = repo.FindExampleByUlr("http://hoge4.com")
382 | 	if err == nil {
383 | 		t.Errorf("search result must be nil")
384 | 	}
385 | }
386 | 
387 | func TestSearchExamplesByUlrs(t *testing.T) {
388 | 	repo, err := repository.New()
389 | 	if err != nil {
390 | 		t.Errorf(err.Error())
391 | 	}
392 | 	defer repo.Close()
393 | 
394 | 	if err = repo.DeleteAllExamples(); err != nil {
395 | 		t.Error(err)
396 | 	}
397 | 
398 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.NEGATIVE))
399 | 	if err != nil {
400 | 		t.Error(err)
401 | 	}
402 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
403 | 	if err != nil {
404 | 		t.Error(err)
405 | 	}
406 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
407 | 	if err != nil {
408 | 		t.Error(err)
409 | 	}
410 | 
411 | 	examples, err := repo.SearchExamplesByUlrs([]string{"http://hoge1.com", "http://hoge2.com"})
412 | 	if err != nil {
413 | 		t.Error(err)
414 | 	}
415 | 	if len(examples) != 2 {
416 | 		t.Errorf("len(examples) == %d, want 2", len(examples))
417 | 	}
418 | }
419 | 
420 | func TestSearchExamplesByLabels(t *testing.T) {
421 | 	repo, err := repository.New()
422 | 	if err != nil {
423 | 		t.Errorf(err.Error())
424 | 	}
425 | 	defer repo.Close()
426 | 
427 | 	if err = repo.DeleteAllExamples(); err != nil {
428 | 		t.Error(err)
429 | 	}
430 | 
431 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE))
432 | 	if err != nil {
433 | 		t.Error(err)
434 | 	}
435 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
436 | 	if err != nil {
437 | 		t.Error(err)
438 | 	}
439 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
440 | 	if err != nil {
441 | 		t.Error(err)
442 | 	}
443 | 
444 | 	examples, err := repo.SearchPositiveExamples(10)
445 | 	if err != nil {
446 | 		t.Error(err)
447 | 	}
448 | 	if len(examples) != 1 {
449 | 		t.Errorf("len(examples) == %d, want 1", len(examples))
450 | 	}
451 | 
452 | 	examples, err = repo.SearchNegativeExamples(10)
453 | 	if err != nil {
454 | 		t.Error(err)
455 | 	}
456 | 	if len(examples) != 1 {
457 | 		t.Errorf("len(examples) == %d, want 1", len(examples))
458 | 	}
459 | 
460 | 	examples, err = repo.SearchUnlabeledExamples(10)
461 | 	if err != nil {
462 | 		t.Error(err)
463 | 	}
464 | 	if len(examples) != 1 {
465 | 		t.Errorf("len(examples) == %d, want 1", len(examples))
466 | 	}
467 | }
468 | 
469 | func TestCountExamplesByLabels(t *testing.T) {
470 | 	repo, err := repository.New()
471 | 	if err != nil {
472 | 		t.Errorf(err.Error())
473 | 	}
474 | 	defer repo.Close()
475 | 
476 | 	if err = repo.DeleteAllExamples(); err != nil {
477 | 		t.Error(err)
478 | 	}
479 | 
480 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge1.com", model.POSITIVE))
481 | 	if err != nil {
482 | 		t.Error(err)
483 | 	}
484 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge2.com", model.NEGATIVE))
485 | 	if err != nil {
486 | 		t.Error(err)
487 | 	}
488 | 	err = repo.UpdateOrCreateExample(example.NewExample("http://hoge3.com", model.UNLABELED))
489 | 	if err != nil {
490 | 		t.Error(err)
491 | 	}
492 | 
493 | 	cnt, err := repo.CountPositiveExamples()
494 | 	if err != nil {
495 | 		t.Error(err)
496 | 	}
497 | 	if cnt != 1 {
498 | 		t.Errorf("len(posExamples) == %d, want 1", cnt)
499 | 	}
500 | 
501 | 	cnt, err = repo.CountNegativeExamples()
502 | 	if err != nil {
503 | 		t.Error(err)
504 | 	}
505 | 	if cnt != 1 {
506 | 		t.Errorf("len(negExamples) == %d, want 1", cnt)
507 | 	}
508 | 
509 | 	cnt, err = repo.CountUnlabeledExamples()
510 | 	if err != nil {
511 | 		t.Error(err)
512 | 	}
513 | 	if cnt != 1 {
514 | 		t.Errorf("len(unlabeledExamples) == %d, want 1", cnt)
515 | 	}
516 | }
517 | 
518 | func TestFeatureVectorReadWrite(t *testing.T) {
519 | 	repo, err := repository.New()
520 | 	if err != nil {
521 | 		t.Errorf(err.Error())
522 | 	}
523 | 	defer repo.Close()
524 | 
525 | 	if err = repo.DeleteAllExamples(); err != nil {
526 | 		t.Error(err)
527 | 	}
528 | 
529 | 	e1 := example.NewExample("http://hoge.com", model.UNLABELED)
530 | 	err = repo.UpdateOrCreateExample(e1)
531 | 	if err != nil {
532 | 		t.Error(err)
533 | 	}
534 | 	e1.Fv = feature.FeatureVector{"BIAS"}
535 | 
536 | 	if err = repo.UpdateFeatureVector(e1); err != nil {
537 | 		t.Error(err)
538 | 	}
539 | 
540 | 	fv, err := repo.FindFeatureVector(e1)
541 | 	if err != nil {
542 | 		t.Error(err)
543 | 	}
544 | 	if len(fv) != 1 {
545 | 		t.Errorf("len(fv) == %d, want 1", len(fv))
546 | 	}
547 | 
548 | 	e2 := example.NewExample("http://fuga.com", model.UNLABELED)
549 | 	err = repo.UpdateOrCreateExample(e2)
550 | 	if err != nil {
551 | 		t.Error(err)
552 | 	}
553 | 	e2.Fv = feature.FeatureVector{"hoge"}
554 | 	if err = repo.UpdateFeatureVector(e2); err != nil {
555 | 		t.Error(err)
556 | 	}
557 | 	fvList, err := repo.SearchFeatureVector(model.Examples{e1, e2})
558 | 	if err != nil {
559 | 		t.Error(err)
560 | 	}
561 | 	if len(fvList) != 2 {
562 | 		t.Errorf("len(fvList) == %d, want 2", len(fvList))
563 | 	}
564 | 	if fvList[e2.Id][0] != "hoge" {
565 | 		t.Errorf("fvList[e2.Id][0] == %s, want hoge", fvList[e2.Id][0])
566 | 	}
567 | }
568 | 
569 | func TestSearchExamplesByWords(t *testing.T) {
570 | 	repo, err := repository.New()
571 | 	if err != nil {
572 | 		t.Errorf(err.Error())
573 | 	}
574 | 	defer repo.Close()
575 | 
576 | 	if err = repo.DeleteAllExamples(); err != nil {
577 | 		t.Error(err)
578 | 	}
579 | 
580 | 	e1 := example.NewExample("http://hoge.com", model.UNLABELED)
581 | 	e1.Title = "日本語"
582 | 	err = repo.UpdateOrCreateExample(e1)
583 | 	if err != nil {
584 | 		t.Error(err)
585 | 	}
586 | 
587 | 	e2 := example.NewExample("http://fuga.com", model.UNLABELED)
588 | 	e2.Title = "英語"
589 | 	err = repo.UpdateOrCreateExample(e2)
590 | 	if err != nil {
591 | 		t.Error(err)
592 | 	}
593 | 
594 | 	examples, err := repo.SearchExamplesByKeywords([]string{"日本語"}, "ALL", 100)
595 | 	if len(examples) != 1 {
596 | 		t.Errorf("len(examples) == %d, want 1", len(examples))
597 | 	}
598 | 	examples, err = repo.SearchExamplesByKeywords([]string{"語"}, "ALL", 100)
599 | 	if len(examples) != 2 {
600 | 		t.Errorf("len(examples) == %d, want 2", len(examples))
601 | 	}
602 | 	examples, err = repo.SearchExamplesByKeywords([]string{"日本語", "英語"}, "ALL", 100)
603 | 	if len(examples) != 0 {
604 | 		t.Errorf("len(examples) == %d, want 0", len(examples))
605 | 	}
606 | 	examples, err = repo.SearchExamplesByKeywords([]string{"日本語", "英語"}, "ANY", 100)
607 | 	if len(examples) != 2 {
608 | 		t.Errorf("len(examples) == %d, want 2", len(examples))
609 | 	}
610 | }
611 | 


--------------------------------------------------------------------------------
/lib/repository/hatena_bookmark.go:
--------------------------------------------------------------------------------
  1 | package repository
  2 | 
  3 | import (
  4 | 	"github.com/lib/pq"
  5 | 	"github.com/syou6162/go-active-learning/lib/model"
  6 | )
  7 | 
// hatenaBookmarkNotFoundError is the package-level value produced by
// model.NotFoundError for the "hatenaBookmark" entity.
// NOTE(review): none of the functions visible in this file reference it —
// confirm it is still used elsewhere in the package.
var hatenaBookmarkNotFoundError = model.NotFoundError("hatenaBookmark")
  9 | 
 10 | func (r *repository) UpdateHatenaBookmark(e *model.Example) error {
 11 | 	if e.HatenaBookmark == nil || e.HatenaBookmark.Count == 0 {
 12 | 		return nil
 13 | 	}
 14 | 
 15 | 	tmp, err := r.FindExampleByUlr(e.Url)
 16 | 	if err != nil {
 17 | 		return err
 18 | 	}
 19 | 	id := tmp.Id
 20 | 
 21 | 	e.HatenaBookmark.ExampleId = id
 22 | 	if _, err = r.db.NamedExec(`
 23 | INSERT INTO hatena_bookmark
 24 | ( example_id,  title,  screenshot,  entry_url,  count,  url,  eid)
 25 | VALUES
 26 | (:example_id, :title, :screenshot, :entry_url, :count, :url, :eid)
 27 | ON CONFLICT (example_id)
 28 | DO UPDATE SET
 29 | title = :title, count = :count
 30 | ;`, e.HatenaBookmark); err != nil {
 31 | 		return err
 32 | 	}
 33 | 
 34 | 	hb := model.HatenaBookmark{}
 35 | 	if err = r.db.Get(&hb, `SELECT id FROM hatena_bookmark WHERE example_id = $1;`, id); err != nil {
 36 | 		return err
 37 | 	}
 38 | 
 39 | 	for _, b := range e.HatenaBookmark.Bookmarks {
 40 | 		b.HatenaBookmarkId = hb.Id
 41 | 		if _, err = r.db.NamedExec(`
 42 | INSERT INTO bookmark
 43 | (hatena_bookmark_id, "user", comment, timestamp, tags)
 44 | VALUES
 45 | (:hatena_bookmark_id, :user, :comment, :timestamp, :tags)
 46 | ON CONFLICT (hatena_bookmark_id, "user") DO NOTHING
 47 | ;`, b); err != nil {
 48 | 			return err
 49 | 		}
 50 | 	}
 51 | 	return nil
 52 | }
 53 | 
 54 | func (r *repository) SearchHatenaBookmarks(examples model.Examples, limitForEachExample int) ([]*model.HatenaBookmark, error) {
 55 | 	hatenaBookmarks := make([]*model.HatenaBookmark, 0)
 56 | 	exampleIds := make([]int, 0)
 57 | 	for _, e := range examples {
 58 | 		exampleIds = append(exampleIds, e.Id)
 59 | 	}
 60 | 
 61 | 	query := `SELECT * FROM hatena_bookmark WHERE example_id = ANY($1);`
 62 | 	err := r.db.Select(&hatenaBookmarks, query, pq.Array(exampleIds))
 63 | 	if err != nil {
 64 | 		return hatenaBookmarks, err
 65 | 	}
 66 | 
 67 | 	hatenaBookmarkIds := make([]int, 0)
 68 | 	for _, hb := range hatenaBookmarks {
 69 | 		hatenaBookmarkIds = append(hatenaBookmarkIds, hb.Id)
 70 | 		hb.Bookmarks = make([]*model.Bookmark, 0)
 71 | 	}
 72 | 	if limitForEachExample == 0 {
 73 | 		return hatenaBookmarks, nil
 74 | 	}
 75 | 
 76 | 	bookmarks := make([]*model.Bookmark, 0)
 77 | 	query = `SELECT * FROM bookmark WHERE hatena_bookmark_id = ANY($1) ORDER BY timestamp LIMIT $2;`
 78 | 	err = r.db.Select(&bookmarks, query, pq.Array(hatenaBookmarkIds), limitForEachExample)
 79 | 	if err != nil {
 80 | 		return hatenaBookmarks, err
 81 | 	}
 82 | 
 83 | 	bookmarksByHatenaBookmarkId := make(map[int][]*model.Bookmark)
 84 | 	for _, b := range bookmarks {
 85 | 		bookmarksByHatenaBookmarkId[b.HatenaBookmarkId] = append(bookmarksByHatenaBookmarkId[b.HatenaBookmarkId], b)
 86 | 	}
 87 | 
 88 | 	result := make([]*model.HatenaBookmark, 0)
 89 | 	for _, hb := range hatenaBookmarks {
 90 | 		bookmarks := bookmarksByHatenaBookmarkId[hb.Id]
 91 | 		hb.Bookmarks = bookmarks
 92 | 		result = append(result, hb)
 93 | 	}
 94 | 	return result, nil
 95 | }
 96 | 
 97 | func (r *repository) FindHatenaBookmark(e *model.Example, limit int) (*model.HatenaBookmark, error) {
 98 | 	hatenaBookmark := &model.HatenaBookmark{}
 99 | 
100 | 	query := `SELECT * FROM hatena_bookmark WHERE example_id = $1;`
101 | 	err := r.db.Get(hatenaBookmark, query, e.Id)
102 | 	if err != nil {
103 | 		return hatenaBookmark, err
104 | 	}
105 | 
106 | 	bookmarks := make([]*model.Bookmark, 0)
107 | 	if limit == 0 {
108 | 		hatenaBookmark.Bookmarks = bookmarks
109 | 		return hatenaBookmark, nil
110 | 	}
111 | 
112 | 	hatenaBookmarkId := hatenaBookmark.Id
113 | 	query = `SELECT * FROM bookmark WHERE hatena_bookmark_id = $1 ORDER BY timestamp LIMIT $2;`
114 | 	err = r.db.Select(&bookmarks, query, hatenaBookmarkId, limit)
115 | 	if err != nil {
116 | 		return hatenaBookmark, err
117 | 	}
118 | 
119 | 	hatenaBookmark.Bookmarks = bookmarks
120 | 	return hatenaBookmark, nil
121 | }
122 | 


--------------------------------------------------------------------------------
/lib/repository/hatena_bookmark_test.go:
--------------------------------------------------------------------------------
 1 | package repository_test
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 	"time"
 6 | 
 7 | 	"github.com/syou6162/go-active-learning/lib/example"
 8 | 	"github.com/syou6162/go-active-learning/lib/model"
 9 | 	"github.com/syou6162/go-active-learning/lib/repository"
10 | )
11 | 
12 | func TestUpdateHatenaBookmark(t *testing.T) {
13 | 	repo, err := repository.New()
14 | 	if err != nil {
15 | 		t.Errorf(err.Error())
16 | 	}
17 | 	defer repo.Close()
18 | 
19 | 	if err = repo.DeleteAllExamples(); err != nil {
20 | 		t.Error(err)
21 | 	}
22 | 
23 | 	e := example.NewExample("http://hoge.com", model.UNLABELED)
24 | 	err = repo.UpdateOrCreateExample(e)
25 | 	if err != nil {
26 | 		t.Error(err)
27 | 	}
28 | 	now := time.Now()
29 | 	b1 := model.Bookmark{
30 | 		User:      "syou6162",
31 | 		Comment:   "面白いサイトですね",
32 | 		Timestamp: model.HatenaBookmarkTime{Time: &now},
33 | 		Tags:      model.Tags{"hack"},
34 | 	}
35 | 	hb := model.HatenaBookmark{
36 | 		ExampleId: e.Id,
37 | 		Title:     "hoge",
38 | 		Count:     10,
39 | 		Bookmarks: []*model.Bookmark{&b1},
40 | 	}
41 | 	e.HatenaBookmark = &hb
42 | 	if err = repo.UpdateHatenaBookmark(e); err != nil {
43 | 		t.Error(err)
44 | 	}
45 | 
46 | 	{
47 | 		result, err := repo.SearchHatenaBookmarks(model.Examples{e}, 10)
48 | 		if err != nil {
49 | 			t.Error(err)
50 | 		}
51 | 
52 | 		for _, tmp := range result {
53 | 			if tmp.Title == "" {
54 | 				t.Error("Title must not be empty")
55 | 			}
56 | 			for _, b := range tmp.Bookmarks {
57 | 				if b.User == "" {
58 | 					t.Error("User must not be empty")
59 | 				}
60 | 				if len(b.Tags) == 0 {
61 | 					t.Error("Tags must not be empty")
62 | 				}
63 | 			}
64 | 		}
65 | 	}
66 | 
67 | 	{
68 | 		result, err := repo.FindHatenaBookmark(e, 10)
69 | 		if err != nil {
70 | 			t.Error(err)
71 | 		}
72 | 
73 | 		if result.Title == "" {
74 | 			t.Error("Title must not be empty")
75 | 		}
76 | 		for _, b := range result.Bookmarks {
77 | 			if b.User == "" {
78 | 				t.Error("User must not be empty")
79 | 			}
80 | 			if len(b.Tags) == 0 {
81 | 				t.Error("Tags must not be empty")
82 | 			}
83 | 		}
84 | 	}
85 | }
86 | 


--------------------------------------------------------------------------------
/lib/repository/mira.go:
--------------------------------------------------------------------------------
 1 | package repository
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 
 6 | 	"github.com/syou6162/go-active-learning/lib/classifier"
 7 | )
 8 | 
 9 | func (r *repository) InsertMIRAModel(m classifier.MIRAClassifier) error {
10 | 	bytes, err := json.Marshal(m)
11 | 	if err != nil {
12 | 		return err
13 | 	}
14 | 	query := `INSERT INTO model (model_type, model, c, accuracy, precision, recall, fvalue) VALUES ($1, $2, $3, $4, $5, $6, $7);`
15 | 	if _, err := r.db.Exec(query, m.ModelType, string(bytes), m.C, m.Accuracy, m.Precision, m.Recall, m.Fvalue); err != nil {
16 | 		return err
17 | 	}
18 | 	return nil
19 | }
20 | 
21 | func (r *repository) FindLatestMIRAModel(modelType classifier.ModelType) (*classifier.MIRAClassifier, error) {
22 | 	type Classifier struct {
23 | 		Model string
24 | 	}
25 | 	tmp := Classifier{}
26 | 
27 | 	query := `SELECT model FROM model WHERE model_type = $1 ORDER BY created_at DESC LIMIT 1;`
28 | 	err := r.db.Get(&tmp, query, modelType)
29 | 	if err != nil {
30 | 		return nil, err
31 | 	}
32 | 
33 | 	clf := classifier.MIRAClassifier{}
34 | 	if err := json.Unmarshal(([]byte)(tmp.Model), &clf); err != nil {
35 | 		return nil, err
36 | 	}
37 | 	return &clf, nil
38 | }
39 | 


--------------------------------------------------------------------------------
/lib/repository/mira_test.go:
--------------------------------------------------------------------------------
 1 | package repository_test
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/syou6162/go-active-learning/lib/classifier"
 7 | 	"github.com/syou6162/go-active-learning/lib/repository"
 8 | )
 9 | 
10 | func TestInsertMIRAModel(t *testing.T) {
11 | 	repo, err := repository.New()
12 | 	if err != nil {
13 | 		t.Errorf(err.Error())
14 | 	}
15 | 	defer repo.Close()
16 | 
17 | 	weight := make(map[string]float64)
18 | 	weight["hoge"] = 1.0
19 | 	weight["fuga"] = 1.0
20 | 	clf := classifier.MIRAClassifier{classifier.EXAMPLE, weight, 10.0, 0.0, 0.0, 0.0, 0.0}
21 | 	err = repo.InsertMIRAModel(clf)
22 | 	if err != nil {
23 | 		t.Error(err)
24 | 	}
25 | 
26 | 	{
27 | 		clf, err := repo.FindLatestMIRAModel(classifier.EXAMPLE)
28 | 		if err != nil {
29 | 			t.Error(err)
30 | 		}
31 | 		if len(clf.Weight) == 0 {
32 | 			t.Error("weight must not be empty")
33 | 		}
34 | 		if clf.C != 10.0 {
35 | 			t.Error("C must be 10.0")
36 | 		}
37 | 	}
38 | }
39 | 


--------------------------------------------------------------------------------
/lib/repository/recommendation.go:
--------------------------------------------------------------------------------
 1 | package repository
 2 | 
 3 | import (
 4 | 	"github.com/lib/pq"
 5 | 	"github.com/syou6162/go-active-learning/lib/model"
 6 | )
 7 | 
 8 | func (r *repository) UpdateRecommendation(rec model.Recommendation) error {
 9 | 	if _, err := r.db.Exec(`DELETE FROM recommendation WHERE list_type = $1;`, rec.RecommendationListType); err != nil {
10 | 		return err
11 | 	}
12 | 	if _, err := r.db.Exec(`INSERT INTO recommendation (list_type, example_id) VALUES ($1, unnest(cast($2 AS INT[])));`, rec.RecommendationListType, pq.Array(rec.ExampleIds)); err != nil {
13 | 		return err
14 | 	}
15 | 	return nil
16 | }
17 | 
18 | func (r *repository) FindRecommendation(t model.RecommendationListType) (*model.Recommendation, error) {
19 | 	rec := &model.Recommendation{RecommendationListType: t}
20 | 	items := make([]int, 0)
21 | 	query := `SELECT example_id FROM recommendation WHERE list_type = $1;`
22 | 	err := r.db.Select(&items, query, t)
23 | 	if err != nil {
24 | 		return nil, err
25 | 	}
26 | 	rec.ExampleIds = items
27 | 	return rec, nil
28 | }
29 | 


--------------------------------------------------------------------------------
/lib/repository/recommendation_test.go:
--------------------------------------------------------------------------------
 1 | package repository_test
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/syou6162/go-active-learning/lib/example"
 7 | 	"github.com/syou6162/go-active-learning/lib/model"
 8 | 	"github.com/syou6162/go-active-learning/lib/repository"
 9 | )
10 | 
11 | func TestUpdateRecommendation(t *testing.T) {
12 | 	repo, err := repository.New()
13 | 	if err != nil {
14 | 		t.Errorf(err.Error())
15 | 	}
16 | 	defer repo.Close()
17 | 
18 | 	e1 := example.NewExample("http://hoge1.com", model.POSITIVE)
19 | 	e2 := example.NewExample("http://hoge2.com", model.NEGATIVE)
20 | 	e3 := example.NewExample("http://hoge3.com", model.UNLABELED)
21 | 	examples := model.Examples{e1, e2, e3}
22 | 	for _, e := range examples {
23 | 		err = repo.UpdateOrCreateExample(e)
24 | 		if err != nil {
25 | 			t.Error(err)
26 | 		}
27 | 	}
28 | 	rec := model.Recommendation{RecommendationListType: model.GENERAL, ExampleIds: []int{e1.Id, e2.Id, e3.Id}}
29 | 	err = repo.UpdateRecommendation(rec)
30 | 	if err != nil {
31 | 		t.Error(err)
32 | 	}
33 | 
34 | 	{
35 | 		rec, err := repo.FindRecommendation(model.GENERAL)
36 | 		if err != nil {
37 | 			t.Error(err)
38 | 		}
39 | 		if len(rec.ExampleIds) != 3 {
40 | 			t.Error("len(rec.ExampleIds) must be 3")
41 | 		}
42 | 	}
43 | }
44 | 


--------------------------------------------------------------------------------
/lib/repository/related_example.go:
--------------------------------------------------------------------------------
 1 | package repository
 2 | 
 3 | import (
 4 | 	"github.com/lib/pq"
 5 | 	"github.com/syou6162/go-active-learning/lib/model"
 6 | )
 7 | 
 8 | func (r *repository) UpdateRelatedExamples(related model.RelatedExamples) error {
 9 | 	if _, err := r.db.Exec(`DELETE FROM related_example WHERE example_id = $1;`, related.ExampleId); err != nil {
10 | 		return err
11 | 	}
12 | 	if _, err := r.db.Exec(`INSERT INTO related_example (example_id, related_example_id) VALUES ($1, unnest(cast($2 AS INT[])));`, related.ExampleId, pq.Array(related.RelatedExampleIds)); err != nil {
13 | 		return err
14 | 	}
15 | 	return nil
16 | }
17 | 
18 | func (r *repository) FindRelatedExamples(e *model.Example) (*model.RelatedExamples, error) {
19 | 	related := &model.RelatedExamples{ExampleId: e.Id}
20 | 	items := make([]int, 0)
21 | 	query := `SELECT related_example_id FROM related_example WHERE example_id = $1;`
22 | 	err := r.db.Select(&items, query, e.Id)
23 | 	if err != nil {
24 | 		return nil, err
25 | 	}
26 | 	related.RelatedExampleIds = items
27 | 	return related, nil
28 | }
29 | 


--------------------------------------------------------------------------------
/lib/repository/related_example_test.go:
--------------------------------------------------------------------------------
 1 | package repository_test
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/syou6162/go-active-learning/lib/example"
 7 | 	"github.com/syou6162/go-active-learning/lib/model"
 8 | 	"github.com/syou6162/go-active-learning/lib/repository"
 9 | )
10 | 
11 | func TestUpdateRelatedExamples(t *testing.T) {
12 | 	repo, err := repository.New()
13 | 	if err != nil {
14 | 		t.Errorf(err.Error())
15 | 	}
16 | 	defer repo.Close()
17 | 
18 | 	e1 := example.NewExample("http://hoge1.com", model.POSITIVE)
19 | 	e2 := example.NewExample("http://hoge2.com", model.NEGATIVE)
20 | 	e3 := example.NewExample("http://hoge3.com", model.UNLABELED)
21 | 	examples := model.Examples{e1, e2, e3}
22 | 	for _, e := range examples {
23 | 		err = repo.UpdateOrCreateExample(e)
24 | 		if err != nil {
25 | 			t.Error(err)
26 | 		}
27 | 	}
28 | 	related := model.RelatedExamples{ExampleId: e1.Id, RelatedExampleIds: []int{e2.Id, e3.Id}}
29 | 	err = repo.UpdateRelatedExamples(related)
30 | 	if err != nil {
31 | 		t.Error(err)
32 | 	}
33 | 
34 | 	{
35 | 		related, err := repo.FindRelatedExamples(e1)
36 | 		if err != nil {
37 | 			t.Error(err)
38 | 		}
39 | 		if len(related.RelatedExampleIds) != 2 {
40 | 			t.Error("len(related.RelatedExampleIds) must be 2")
41 | 		}
42 | 	}
43 | 	{
44 | 		related, err := repo.FindRelatedExamples(e2)
45 | 		if err != nil {
46 | 			t.Error(err)
47 | 		}
48 | 		if len(related.RelatedExampleIds) != 0 {
49 | 			t.Error("len(related.RelatedExampleIds) must be 0")
50 | 		}
51 | 	}
52 | }
53 | 
54 | func TestUpdateRelatedExamplesMyOwn(t *testing.T) {
55 | 	repo, err := repository.New()
56 | 	if err != nil {
57 | 		t.Errorf(err.Error())
58 | 	}
59 | 	defer repo.Close()
60 | 
61 | 	e1 := example.NewExample("http://hoge1.com", model.POSITIVE)
62 | 	e2 := example.NewExample("http://hoge2.com", model.NEGATIVE)
63 | 	e3 := example.NewExample("http://hoge3.com", model.UNLABELED)
64 | 	examples := model.Examples{e1, e2, e3}
65 | 	for _, e := range examples {
66 | 		err = repo.UpdateOrCreateExample(e)
67 | 		if err != nil {
68 | 			t.Error(err)
69 | 		}
70 | 	}
71 | 	related := model.RelatedExamples{ExampleId: e1.Id, RelatedExampleIds: []int{e1.Id, e2.Id, e3.Id}}
72 | 	err = repo.UpdateRelatedExamples(related)
73 | 	if err == nil {
74 | 		t.Error("自身と同一のexample_idを持つ事例はrelated_example_idに追加できない")
75 | 	}
76 | }
77 | 


--------------------------------------------------------------------------------
/lib/repository/repository.go:
--------------------------------------------------------------------------------
  1 | package repository
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io"
  6 | 	"time"
  7 | 
  8 | 	"github.com/jmoiron/sqlx"
  9 | 
 10 | 	"bufio"
 11 | 
 12 | 	_ "github.com/lib/pq"
 13 | 	"github.com/syou6162/go-active-learning/lib/classifier"
 14 | 	"github.com/syou6162/go-active-learning/lib/feature"
 15 | 	"github.com/syou6162/go-active-learning/lib/model"
 16 | 	"github.com/syou6162/go-active-learning/lib/util"
 17 | )
 18 | 
// Repository is the persistence interface of this package. The unexported
// repository type in this package provides implementations on top of a
// *sqlx.DB.
type Repository interface {
	// Writing, searching and deleting model.Example values.
	UpdateOrCreateExample(e *model.Example) error
	UpdateScore(e *model.Example) error
	InsertExampleFromScanner(scanner *bufio.Scanner) (*model.Example, error)
	InsertExamplesFromReader(reader io.Reader) error
	SearchExamples() (model.Examples, error)
	SearchRecentExamples(from time.Time, limit int) (model.Examples, error)
	SearchRecentExamplesByHost(host string, from time.Time, limit int) (model.Examples, error)
	SearchExamplesByLabel(label model.LabelType, limit int) (model.Examples, error)
	SearchLabeledExamples(limit int) (model.Examples, error)
	SearchPositiveExamples(limit int) (model.Examples, error)
	SearchNegativeExamples(limit int) (model.Examples, error)
	SearchUnlabeledExamples(limit int) (model.Examples, error)
	SearchPositiveScoredExamples(limit int) (model.Examples, error)
	// NOTE: "Ulr" is the spelling used by the method names; it is part of
	// the interface and callers depend on it.
	FindExampleByUlr(url string) (*model.Example, error)
	FindExampleById(id int) (*model.Example, error)
	SearchExamplesByUlrs(urls []string) (model.Examples, error)
	SearchExamplesByIds(ids []int) (model.Examples, error)
	SearchExamplesByKeywords(keywords []string, aggregator string, limit int) (model.Examples, error)
	DeleteAllExamples() error

	// Counts of examples per label.
	CountPositiveExamples() (int, error)
	CountNegativeExamples() (int, error)
	CountUnlabeledExamples() (int, error)

	// Per-example error counter.
	IncErrorCount(e *model.Example) error
	GetErrorCount(e *model.Example) (int, error)

	// Feature vectors; SearchFeatureVector returns a map keyed by int
	// (presumably the example id; confirm against the implementation).
	UpdateFeatureVector(e *model.Example) error
	FindFeatureVector(e *model.Example) (feature.FeatureVector, error)
	SearchFeatureVector(examples model.Examples) (map[int]feature.FeatureVector, error)

	// Hatena Bookmark data attached to examples.
	UpdateHatenaBookmark(e *model.Example) error
	SearchHatenaBookmarks(examples model.Examples, limitForEachExample int) ([]*model.HatenaBookmark, error)
	FindHatenaBookmark(e *model.Example, limit int) (*model.HatenaBookmark, error)

	// Tweets referring to examples.
	UpdateOrCreateReferringTweets(e *model.Example) error
	UpdateTweetLabel(exampleId int, idStr string, label model.LabelType) error
	SearchReferringTweetsList(examples model.Examples, limit int) (map[int]model.ReferringTweets, error)
	SearchReferringTweets(limit int) (model.ReferringTweets, error)
	SearchPositiveReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error)
	SearchNegativeReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error)
	SearchUnlabeledReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error)
	FindReferringTweets(e *model.Example, limit int) (model.ReferringTweets, error)
	SearchRecentReferringTweetsWithHighScore(from time.Time, scoreThreshold float64, limit int) (model.ReferringTweets, error)

	// Stored MIRA classifier models.
	InsertMIRAModel(m classifier.MIRAClassifier) error
	FindLatestMIRAModel(modelType classifier.ModelType) (*classifier.MIRAClassifier, error)

	// Recommendation lists, looked up by list type.
	UpdateRecommendation(rec model.Recommendation) error
	FindRecommendation(t model.RecommendationListType) (*model.Recommendation, error)

	// Related example ids per example.
	UpdateRelatedExamples(related model.RelatedExamples) error
	FindRelatedExamples(e *model.Example) (*model.RelatedExamples, error)

	// Ids of the top accessed examples.
	UpdateTopAccessedExampleIds([]int) error
	SearchTopAccessedExampleIds() ([]int, error)

	// Connection health check and shutdown.
	Ping() error
	Close() error
}
 80 | 
// repository is the sqlx-backed type returned by New; its methods run their
// queries through db.
type repository struct {
	db *sqlx.DB
}
 84 | 
 85 | func GetDataSourceName() string {
 86 | 	host := util.GetEnv("POSTGRES_HOST", "localhost")
 87 | 	dbUser := util.GetEnv("DB_USER", "nobody")
 88 | 	dbPassword := util.GetEnv("DB_PASSWORD", "nobody")
 89 | 	dbName := util.GetEnv("DB_NAME", "go-active-learning")
 90 | 	return fmt.Sprintf(
 91 | 		"host=%s user=%s password=%s dbname=%s sslmode=disable",
 92 | 		host, dbUser, dbPassword, dbName,
 93 | 	)
 94 | }
 95 | 
 96 | func New() (*repository, error) {
 97 | 	db, err := sqlx.Open("postgres", GetDataSourceName())
 98 | 	if err != nil {
 99 | 		return nil, err
100 | 	}
101 | 	db.SetMaxOpenConns(50)
102 | 	return &repository{db: db}, nil
103 | }
104 | 
// Ping calls Ping on the underlying database handle and returns its result.
func (r *repository) Ping() error {
	return r.db.Ping()
}
108 | 
109 | func (r *repository) Close() error {
110 | 	if r.db != nil {
111 | 		return r.db.Close()
112 | 	} else {
113 | 		return nil
114 | 	}
115 | }
116 | 


--------------------------------------------------------------------------------
/lib/repository/top_accessed_example.go:
--------------------------------------------------------------------------------
 1 | package repository
 2 | 
 3 | import (
 4 | 	"github.com/lib/pq"
 5 | )
 6 | 
 7 | func (r *repository) UpdateTopAccessedExampleIds(exampleIds []int) error {
 8 | 	if _, err := r.db.Exec(`DELETE FROM top_accessed_example;`); err != nil {
 9 | 		return err
10 | 	}
11 | 	if _, err := r.db.Exec(`INSERT INTO top_accessed_example (example_id) VALUES (unnest(cast($1 AS INT[])));`, pq.Array(exampleIds)); err != nil {
12 | 		return err
13 | 	}
14 | 	return nil
15 | }
16 | 
17 | func (r *repository) SearchTopAccessedExampleIds() ([]int, error) {
18 | 	exampleIds := make([]int, 0)
19 | 	query := `SELECT example_id FROM top_accessed_example;`
20 | 	err := r.db.Select(&exampleIds, query)
21 | 	if err != nil {
22 | 		return nil, err
23 | 	}
24 | 	return exampleIds, nil
25 | }
26 | 


--------------------------------------------------------------------------------
/lib/repository/top_accessed_example_test.go:
--------------------------------------------------------------------------------
 1 | package repository_test
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/syou6162/go-active-learning/lib/example"
 7 | 	"github.com/syou6162/go-active-learning/lib/model"
 8 | 	"github.com/syou6162/go-active-learning/lib/repository"
 9 | )
10 | 
11 | func TestUpdateTopAccessedExampleIds(t *testing.T) {
12 | 	repo, err := repository.New()
13 | 	if err != nil {
14 | 		t.Errorf(err.Error())
15 | 	}
16 | 	defer repo.Close()
17 | 
18 | 	e1 := example.NewExample("http://hoge1.com", model.POSITIVE)
19 | 	e2 := example.NewExample("http://hoge2.com", model.NEGATIVE)
20 | 	e3 := example.NewExample("http://hoge3.com", model.UNLABELED)
21 | 	examples := model.Examples{e1, e2, e3}
22 | 	for _, e := range examples {
23 | 		err = repo.UpdateOrCreateExample(e)
24 | 		if err != nil {
25 | 			t.Error(err)
26 | 		}
27 | 	}
28 | 	exampleIds := make([]int, 0)
29 | 	for _, e := range examples {
30 | 		exampleIds = append(exampleIds, e.Id)
31 | 	}
32 | 	err = repo.UpdateTopAccessedExampleIds(exampleIds)
33 | 	if err != nil {
34 | 		t.Error(err)
35 | 	}
36 | 
37 | 	{
38 | 		top, err := repo.SearchTopAccessedExampleIds()
39 | 		if err != nil {
40 | 			t.Error(err)
41 | 		}
42 | 		if len(top) != 3 {
43 | 			t.Error("len(top) must be 3")
44 | 		}
45 | 	}
46 | }
47 | 


--------------------------------------------------------------------------------
/lib/repository/tweet.go:
--------------------------------------------------------------------------------
  1 | package repository
  2 | 
  3 | import (
  4 | 	"time"
  5 | 
  6 | 	"github.com/lib/pq"
  7 | 	"github.com/syou6162/go-active-learning/lib/model"
  8 | )
  9 | 
 10 | func (r *repository) UpdateOrCreateReferringTweets(e *model.Example) error {
 11 | 	if e.ReferringTweets == nil || len((*e).ReferringTweets.Tweets) == 0 || (*e).ReferringTweets.Count == 0 {
 12 | 		return nil
 13 | 	}
 14 | 
 15 | 	tmp, err := r.FindExampleByUlr(e.Url)
 16 | 	if err != nil {
 17 | 		return err
 18 | 	}
 19 | 	id := tmp.Id
 20 | 
 21 | 	for _, t := range (*e).ReferringTweets.Tweets {
 22 | 		t.ExampleId = id
 23 | 		if _, err = r.db.NamedExec(`
 24 | INSERT INTO tweet
 25 | ( example_id,  created_at,  id_str,  full_text,  favorite_count,  retweet_count,  lang,  screen_name,  name,  profile_image_url,  label,  score)
 26 | VALUES
 27 | (:example_id, :created_at, :id_str, :full_text, :favorite_count, :retweet_count, :lang, :screen_name, :name, :profile_image_url, :label, :score)
 28 | ON CONFLICT (example_id, id_str)
 29 | DO UPDATE SET
 30 | favorite_count = :favorite_count,  retweet_count = :retweet_count, label = :label
 31 | WHERE
 32 | EXCLUDED.label != 0 AND tweet.label != EXCLUDED.label
 33 | ;`, t); err != nil {
 34 | 			return err
 35 | 		}
 36 | 	}
 37 | 	return nil
 38 | }
 39 | 
 40 | func (r *repository) UpdateTweetLabel(exampleId int, idStr string, label model.LabelType) error {
 41 | 	if _, err := r.db.Exec(`UPDATE tweet SET label = $1 WHERE example_id = $2 AND id_str = $3;`, label, exampleId, idStr); err != nil {
 42 | 		return err
 43 | 	}
 44 | 	return nil
 45 | }
 46 | 
// exampleIdWithTweetsCount is the scan target for one row of the per-example
// tweet count query in SearchReferringTweetsList (columns example_id and
// tweets_count).
type exampleIdWithTweetsCount struct {
	ExampleId   int `db:"example_id"`
	TweetsCount int `db:"tweets_count"`
}
 51 | 
 52 | func (r *repository) SearchReferringTweetsList(examples model.Examples, limitForEachExample int) (map[int]model.ReferringTweets, error) {
 53 | 	referringTweetsByExampleId := make(map[int]model.ReferringTweets)
 54 | 
 55 | 	exampleIds := make([]int, 0)
 56 | 	for _, e := range examples {
 57 | 		exampleIds = append(exampleIds, e.Id)
 58 | 	}
 59 | 
 60 | 	exampleIdsWithTweetsCount := make([]exampleIdWithTweetsCount, 0)
 61 | 	tweetsCountByExampleQuery := `SELECT example_id, COUNT(*) AS tweets_count FROM tweet WHERE example_id = ANY($1) GROUP BY example_id ORDER BY tweets_count DESC;`
 62 | 	err := r.db.Select(&exampleIdsWithTweetsCount, tweetsCountByExampleQuery, pq.Array(exampleIds))
 63 | 	if err != nil {
 64 | 		return referringTweetsByExampleId, err
 65 | 	}
 66 | 	tweetsCountByExampleId := make(map[int]int)
 67 | 	for _, e := range exampleIdsWithTweetsCount {
 68 | 		tweetsCountByExampleId[e.ExampleId] = e.TweetsCount
 69 | 	}
 70 | 
 71 | 	if limitForEachExample == 0 {
 72 | 		for _, exampleId := range exampleIds {
 73 | 			referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)}
 74 | 			if cnt, ok := tweetsCountByExampleId[exampleId]; ok {
 75 | 				referringTweets.Count = cnt
 76 | 			}
 77 | 			referringTweetsByExampleId[exampleId] = referringTweets
 78 | 		}
 79 | 		return referringTweetsByExampleId, nil
 80 | 	}
 81 | 
 82 | 	tweets := make([]*model.Tweet, 0)
 83 | 	query := `SELECT * FROM tweet WHERE example_id = ANY($1) AND label != -1 AND score > -1.0 AND (lang = 'en' OR lang = 'ja') ORDER BY favorite_count DESC LIMIT $2;`
 84 | 	err = r.db.Select(&tweets, query, pq.Array(exampleIds), limitForEachExample)
 85 | 	if err != nil {
 86 | 		return referringTweetsByExampleId, err
 87 | 	}
 88 | 	tweetsByExampleId := make(map[int][]*model.Tweet)
 89 | 	for _, t := range tweets {
 90 | 		tweetsByExampleId[t.ExampleId] = append(tweetsByExampleId[t.ExampleId], t)
 91 | 	}
 92 | 
 93 | 	for _, exampleId := range exampleIds {
 94 | 		referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)}
 95 | 		if tweets, ok := tweetsByExampleId[exampleId]; ok {
 96 | 			referringTweets.Tweets = tweets
 97 | 		}
 98 | 		if cnt, ok := tweetsCountByExampleId[exampleId]; ok {
 99 | 			referringTweets.Count = cnt
100 | 		}
101 | 		referringTweetsByExampleId[exampleId] = referringTweets
102 | 	}
103 | 	return referringTweetsByExampleId, nil
104 | }
105 | 
106 | func (r *repository) SearchReferringTweets(limit int) (model.ReferringTweets, error) {
107 | 	referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)}
108 | 	query := `SELECT * FROM tweet WHERE lang = 'en' OR lang = 'ja' ORDER BY created_at DESC LIMIT $1;`
109 | 	err := r.db.Select(&referringTweets.Tweets, query, limit)
110 | 	if err != nil {
111 | 		return referringTweets, err
112 | 	}
113 | 	referringTweets.Count = len(referringTweets.Tweets)
114 | 	return referringTweets, nil
115 | }
116 | 
117 | func (r *repository) SearchRecentReferringTweetsWithHighScore(from time.Time, scoreThreshold float64, limit int) (model.ReferringTweets, error) {
118 | 	referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)}
119 | 	query := `
120 | SELECT 
121 | 	tweet.id,
122 | 	tweet.example_id,
123 | 
124 | 	tweet.created_at,
125 | 	tweet.id_str,
126 | 	tweet.full_text,
127 | 	tweet.favorite_count,
128 | 	tweet.retweet_count,
129 | 	tweet.lang,
130 | 
131 | 	tweet.screen_name,
132 | 	tweet.name,
133 | 	tweet.profile_image_url,
134 | 	tweet.label,
135 | 	tweet.score
136 | FROM 
137 | 	tweet 
138 | INNER JOIN 
139 | 	example ON example.id = example_id 
140 | WHERE
141 | 	tweet.created_at > $1 AND 
142 | 	tweet.label != -1 AND 
143 | 	example.label != -1 AND 
144 | 	tweet.score > $2 AND 
145 | 	(favorite_count > 0 OR retweet_count > 0) AND
146 | 	(lang = 'en' OR lang = 'ja')
147 | ORDER BY tweet.score DESC
148 | LIMIT $3
149 | ;
150 | `
151 | 	err := r.db.Select(&referringTweets.Tweets, query, from, scoreThreshold, limit)
152 | 	if err != nil {
153 | 		return referringTweets, err
154 | 	}
155 | 	referringTweets.Count = len(referringTweets.Tweets)
156 | 	return referringTweets, nil
157 | }
158 | 
159 | func (r *repository) searchReferringTweetsByLabel(label model.LabelType, scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
160 | 	referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)}
161 | 	query := `
162 | WITH t AS (
163 |   SELECT
164 |     id,
165 |     ROW_NUMBER() OVER(partition BY example_id ORDER BY favorite_count DESC) AS rank_example_id,
166 |     ROW_NUMBER() OVER(partition BY id_str ORDER BY favorite_count DESC) AS rank_id_str
167 |   FROM
168 |     tweet
169 |   WHERE
170 |     example_id IN (SELECT id FROM example WHERE label != -1 AND updated_at > NOW() - INTERVAL '30 DAYS')
171 |     AND label = $1 AND (lang = 'en' OR lang = 'ja') AND score > $2
172 | )
173 | 
174 | SELECT
175 |   *
176 | FROM
177 |   tweet
178 | WHERE
179 |   id IN (SELECT id FROM t WHERE rank_example_id <= $3 AND rank_id_str = 1)
180 | ORDER BY
181 |   created_at DESC
182 | LIMIT $4
183 | ;`
184 | 	err := r.db.Select(&referringTweets.Tweets, query, label, scoreThreshold, tweetsLimitInSameExample, limit)
185 | 	if err != nil {
186 | 		return referringTweets, err
187 | 	}
188 | 	referringTweets.Count = len(referringTweets.Tweets)
189 | 	return referringTweets, nil
190 | }
191 | 
// SearchPositiveReferringTweets calls searchReferringTweetsByLabel with
// model.POSITIVE and passes the remaining arguments through unchanged.
func (r *repository) SearchPositiveReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
	return r.searchReferringTweetsByLabel(model.POSITIVE, scoreThreshold, tweetsLimitInSameExample, limit)
}
195 | 
// SearchNegativeReferringTweets calls searchReferringTweetsByLabel with
// model.NEGATIVE and passes the remaining arguments through unchanged.
func (r *repository) SearchNegativeReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
	return r.searchReferringTweetsByLabel(model.NEGATIVE, scoreThreshold, tweetsLimitInSameExample, limit)
}
199 | 
// SearchUnlabeledReferringTweets calls searchReferringTweetsByLabel with
// model.UNLABELED and passes the remaining arguments through unchanged.
func (r *repository) SearchUnlabeledReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
	return r.searchReferringTweetsByLabel(model.UNLABELED, scoreThreshold, tweetsLimitInSameExample, limit)
}
203 | 
// tweetsCount is the scan target for the single-row "COUNT(*) AS count" query
// in FindReferringTweets.
type tweetsCount struct {
	Count int `db:"count"`
}
207 | 
208 | func (r *repository) FindReferringTweets(e *model.Example, limit int) (model.ReferringTweets, error) {
209 | 	referringTweets := model.ReferringTweets{Count: 0, Tweets: make([]*model.Tweet, 0)}
210 | 
211 | 	countQuery := `SELECT COUNT(*) AS count FROM tweet WHERE example_id = $1;`
212 | 	cnt := tweetsCount{}
213 | 	err := r.db.Get(&cnt, countQuery, e.Id)
214 | 	if err != nil {
215 | 		return referringTweets, err
216 | 	}
217 | 	referringTweets.Count = cnt.Count
218 | 	if limit == 0 {
219 | 		return referringTweets, err
220 | 	}
221 | 
222 | 	query := `SELECT * FROM tweet WHERE example_id = $1 AND label != -1 AND score > 0.0 AND (lang = 'en' OR lang = 'ja') ORDER BY favorite_count DESC LIMIT $2;`
223 | 	err = r.db.Select(&referringTweets.Tweets, query, e.Id, limit)
224 | 	if err != nil {
225 | 		return referringTweets, err
226 | 	}
227 | 	return referringTweets, nil
228 | }
229 | 


--------------------------------------------------------------------------------
/lib/repository/tweet_test.go:
--------------------------------------------------------------------------------
  1 | package repository_test
  2 | 
  3 | import (
  4 | 	"testing"
  5 | 	"time"
  6 | 
  7 | 	"github.com/syou6162/go-active-learning/lib/example"
  8 | 	"github.com/syou6162/go-active-learning/lib/model"
  9 | 	"github.com/syou6162/go-active-learning/lib/repository"
 10 | )
 11 | 
 12 | func TestUpdateReferringTweets(t *testing.T) {
 13 | 	repo, err := repository.New()
 14 | 	if err != nil {
 15 | 		t.Errorf(err.Error())
 16 | 	}
 17 | 	defer repo.Close()
 18 | 
 19 | 	if err = repo.DeleteAllExamples(); err != nil {
 20 | 		t.Error(err)
 21 | 	}
 22 | 
 23 | 	e := example.NewExample("http://hoge.com", model.UNLABELED)
 24 | 	err = repo.UpdateOrCreateExample(e)
 25 | 	if err != nil {
 26 | 		t.Error(err)
 27 | 	}
 28 | 	now := time.Now()
 29 | 	idStr := "1111111"
 30 | 	t1 := model.Tweet{
 31 | 		CreatedAt:       now,
 32 | 		IdStr:           idStr,
 33 | 		FullText:        "hello world!!!",
 34 | 		FavoriteCount:   10,
 35 | 		RetweetCount:    10,
 36 | 		Lang:            "en",
 37 | 		ScreenName:      "syou6162",
 38 | 		Name:            "syou6162",
 39 | 		ProfileImageUrl: "http://hogehoge.com/profile.png",
 40 | 		Score:           1.0,
 41 | 	}
 42 | 
 43 | 	tweets := model.ReferringTweets{}
 44 | 	tweets.Tweets = append(tweets.Tweets, &t1)
 45 | 	tweets.Count = len(tweets.Tweets)
 46 | 	e.ReferringTweets = &tweets
 47 | 	if err = repo.UpdateOrCreateReferringTweets(e); err != nil {
 48 | 		t.Error(err)
 49 | 	}
 50 | 
 51 | 	{
 52 | 		result, err := repo.SearchReferringTweetsList(model.Examples{e}, 10)
 53 | 		if err != nil {
 54 | 			t.Error(err)
 55 | 		}
 56 | 		if len(result) == 0 {
 57 | 			t.Error("result must not be empty")
 58 | 		}
 59 | 		if len(result[e.Id].Tweets) == 0 {
 60 | 			t.Error("result must not be empty")
 61 | 		}
 62 | 		if result[e.Id].Count == 0 {
 63 | 			t.Error("result must not be zero")
 64 | 		}
 65 | 		if result[e.Id].Tweets[0].Name != "syou6162" {
 66 | 			t.Error("Name must be syou6162")
 67 | 		}
 68 | 	}
 69 | 
 70 | 	{
 71 | 		result, err := repo.FindReferringTweets(e, 10)
 72 | 		if err != nil {
 73 | 			t.Error(err)
 74 | 		}
 75 | 		if len(result.Tweets) == 0 {
 76 | 			t.Error("result must not be empty")
 77 | 		}
 78 | 		if result.Count == 0 {
 79 | 			t.Error("result must not be empty")
 80 | 		}
 81 | 		if result.Tweets[0].Name != "syou6162" {
 82 | 			t.Error("Name must be syou6162")
 83 | 		}
 84 | 	}
 85 | 
 86 | 	{
 87 | 		result, err := repo.FindReferringTweets(e, 0)
 88 | 		if err != nil {
 89 | 			t.Error(err)
 90 | 		}
 91 | 		if len(result.Tweets) != 0 {
 92 | 			t.Error("result must be empty")
 93 | 		}
 94 | 		if result.Count == 0 {
 95 | 			t.Error("result must not be empty")
 96 | 		}
 97 | 	}
 98 | 
 99 | 	{
100 | 		if err := repo.UpdateTweetLabel(e.Id, idStr, model.NEGATIVE); err != nil {
101 | 			t.Error(err)
102 | 		}
103 | 		result, err := repo.FindReferringTweets(e, 10)
104 | 		if err != nil {
105 | 			t.Error(err)
106 | 		}
107 | 		if len(result.Tweets) != 0 {
108 | 			t.Error("result must be empty")
109 | 		}
110 | 		if result.Count != 1 {
111 | 			t.Error("result must be 1")
112 | 		}
113 | 	}
114 | }
115 | 
116 | func TestSearchReferringTweetsByLabel(t *testing.T) {
117 | 	repo, err := repository.New()
118 | 	if err != nil {
119 | 		t.Errorf(err.Error())
120 | 	}
121 | 	defer repo.Close()
122 | 
123 | 	if err = repo.DeleteAllExamples(); err != nil {
124 | 		t.Error(err)
125 | 	}
126 | 
127 | 	e := example.NewExample("http://hoge.com", model.UNLABELED)
128 | 	err = repo.UpdateOrCreateExample(e)
129 | 	if err != nil {
130 | 		t.Error(err)
131 | 	}
132 | 	now := time.Now()
133 | 	idStr := "1111111"
134 | 	t1 := model.Tweet{
135 | 		CreatedAt:       now,
136 | 		IdStr:           idStr,
137 | 		FullText:        "hello world!!!",
138 | 		FavoriteCount:   10,
139 | 		RetweetCount:    10,
140 | 		Lang:            "en",
141 | 		ScreenName:      "syou6162",
142 | 		Name:            "syou6162",
143 | 		ProfileImageUrl: "http://hogehoge.com/profile.png",
144 | 		Label:           model.POSITIVE,
145 | 	}
146 | 
147 | 	tweets := model.ReferringTweets{}
148 | 	tweets.Tweets = append(tweets.Tweets, &t1)
149 | 	tweets.Count = len(tweets.Tweets)
150 | 	e.ReferringTweets = &tweets
151 | 	if err = repo.UpdateOrCreateReferringTweets(e); err != nil {
152 | 		t.Error(err)
153 | 	}
154 | 
155 | 	limit := 10
156 | 	{
157 | 		result, err := repo.SearchPositiveReferringTweets(-1.0, 3, limit)
158 | 		if err != nil {
159 | 			t.Error(err)
160 | 		}
161 | 		if len(result.Tweets) != 1 {
162 | 			t.Error("len(result) must be 1")
163 | 		}
164 | 		if result.Count != 1 {
165 | 			t.Error("Count must be 1")
166 | 		}
167 | 	}
168 | 	{
169 | 		result, err := repo.SearchNegativeReferringTweets(-1.0, 3, limit)
170 | 		if err != nil {
171 | 			t.Error(err)
172 | 		}
173 | 		if len(result.Tweets) != 0 {
174 | 			t.Error("len(result) must be empty")
175 | 		}
176 | 		if result.Count != 0 {
177 | 			t.Error("Count must be zero")
178 | 		}
179 | 	}
180 | }
181 | 
182 | func TestSearchRecentReferringTweetsWithHighScore(t *testing.T) {
183 | 	repo, err := repository.New()
184 | 	if err != nil {
185 | 		t.Errorf(err.Error())
186 | 	}
187 | 	defer repo.Close()
188 | 
189 | 	if err = repo.DeleteAllExamples(); err != nil {
190 | 		t.Error(err)
191 | 	}
192 | 
193 | 	e := example.NewExample("http://hoge.com", model.UNLABELED)
194 | 	err = repo.UpdateOrCreateExample(e)
195 | 	if err != nil {
196 | 		t.Error(err)
197 | 	}
198 | 	now := time.Now()
199 | 	t1 := model.Tweet{
200 | 		CreatedAt:       now,
201 | 		IdStr:           "1111111",
202 | 		FullText:        "hello world!!!",
203 | 		FavoriteCount:   10,
204 | 		RetweetCount:    10,
205 | 		Lang:            "en",
206 | 		ScreenName:      "syou6162",
207 | 		Name:            "syou6162",
208 | 		ProfileImageUrl: "http://hogehoge.com/profile.png",
209 | 		Label:           model.POSITIVE,
210 | 		Score:           10.0,
211 | 	}
212 | 	t2 := model.Tweet{
213 | 		CreatedAt:       now,
214 | 		IdStr:           "22222222",
215 | 		FullText:        "hello world!!!",
216 | 		FavoriteCount:   10,
217 | 		RetweetCount:    10,
218 | 		Lang:            "en",
219 | 		ScreenName:      "syou6162",
220 | 		Name:            "syou6162",
221 | 		ProfileImageUrl: "http://hogehoge.com/profile.png",
222 | 		Label:           model.POSITIVE,
223 | 		Score:           10.0,
224 | 	}
225 | 	t3 := model.Tweet{
226 | 		CreatedAt:       now,
227 | 		IdStr:           "3333333333",
228 | 		FullText:        "hello world!!!",
229 | 		FavoriteCount:   10,
230 | 		RetweetCount:    10,
231 | 		Lang:            "en",
232 | 		ScreenName:      "syou6162",
233 | 		Name:            "syou6162",
234 | 		ProfileImageUrl: "http://hogehoge.com/profile.png",
235 | 		Label:           model.POSITIVE,
236 | 		Score:           -10.0,
237 | 	}
238 | 
239 | 	tweets := model.ReferringTweets{}
240 | 	tweets.Tweets = append(tweets.Tweets, &t1, &t2, &t3)
241 | 	tweets.Count = len(tweets.Tweets)
242 | 	e.ReferringTweets = &tweets
243 | 	if err = repo.UpdateOrCreateReferringTweets(e); err != nil {
244 | 		t.Error(err)
245 | 	}
246 | 
247 | 	limit := 10
248 | 	{
249 | 		result, err := repo.SearchRecentReferringTweetsWithHighScore(now.Add(time.Duration(-10*24)*time.Hour), 0.0, limit)
250 | 		if err != nil {
251 | 			t.Error(err)
252 | 		}
253 | 		if len(result.Tweets) != 2 {
254 | 			t.Error("len(result) must be 2")
255 | 		}
256 | 		if result.Count != 2 {
257 | 			t.Error("Count must be 2")
258 | 		}
259 | 	}
260 | }
261 | 


--------------------------------------------------------------------------------
/lib/service/example.go:
--------------------------------------------------------------------------------
  1 | package service
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"fmt"
  6 | 	"io"
  7 | 	"log"
  8 | 	"net/http"
  9 | 	"runtime"
 10 | 	"time"
 11 | 
 12 | 	"math"
 13 | 	"os"
 14 | 	"strconv"
 15 | 	"sync"
 16 | 
 17 | 	"github.com/syou6162/go-active-learning/lib/example"
 18 | 	"github.com/syou6162/go-active-learning/lib/fetcher"
 19 | 	"github.com/syou6162/go-active-learning/lib/model"
 20 | 	"github.com/syou6162/go-active-learning/lib/util"
 21 | )
 22 | 
// UpdateOrCreateExample forwards to app.repo.UpdateOrCreateExample and
// returns its result unchanged.
func (app *goActiveLearningApp) UpdateOrCreateExample(e *model.Example) error {
	return app.repo.UpdateOrCreateExample(e)
}
 26 | 
// UpdateScore forwards to app.repo.UpdateScore and returns its result
// unchanged.
func (app *goActiveLearningApp) UpdateScore(e *model.Example) error {
	return app.repo.UpdateScore(e)
}
 30 | 
// InsertExampleFromScanner forwards to app.repo.InsertExampleFromScanner and
// returns its result unchanged.
func (app *goActiveLearningApp) InsertExampleFromScanner(scanner *bufio.Scanner) (*model.Example, error) {
	return app.repo.InsertExampleFromScanner(scanner)
}
 34 | 
// InsertExamplesFromReader forwards to app.repo.InsertExamplesFromReader and
// returns its result unchanged.
func (app *goActiveLearningApp) InsertExamplesFromReader(reader io.Reader) error {
	return app.repo.InsertExamplesFromReader(reader)
}
 38 | 
// SearchExamples forwards to app.repo.SearchExamples and returns its result
// unchanged.
func (app *goActiveLearningApp) SearchExamples() (model.Examples, error) {
	return app.repo.SearchExamples()
}
 42 | 
// SearchRecentExamples forwards to app.repo.SearchRecentExamples and returns
// its result unchanged.
func (app *goActiveLearningApp) SearchRecentExamples(from time.Time, limit int) (model.Examples, error) {
	return app.repo.SearchRecentExamples(from, limit)
}
 46 | 
// SearchRecentExamplesByHost forwards to app.repo.SearchRecentExamplesByHost
// and returns its result unchanged.
func (app *goActiveLearningApp) SearchRecentExamplesByHost(host string, from time.Time, limit int) (model.Examples, error) {
	return app.repo.SearchRecentExamplesByHost(host, from, limit)
}
 50 | 
// SearchExamplesByLabel forwards to app.repo.SearchExamplesByLabel and returns
// its result unchanged.
func (app *goActiveLearningApp) SearchExamplesByLabel(label model.LabelType, limit int) (model.Examples, error) {
	return app.repo.SearchExamplesByLabel(label, limit)
}
 54 | 
// SearchLabeledExamples forwards to app.repo.SearchLabeledExamples and returns
// its result unchanged.
func (app *goActiveLearningApp) SearchLabeledExamples(limit int) (model.Examples, error) {
	return app.repo.SearchLabeledExamples(limit)
}
 58 | 
// SearchPositiveExamples forwards to app.repo.SearchPositiveExamples and
// returns its result unchanged.
func (app *goActiveLearningApp) SearchPositiveExamples(limit int) (model.Examples, error) {
	return app.repo.SearchPositiveExamples(limit)
}
 62 | 
// SearchNegativeExamples forwards to app.repo.SearchNegativeExamples and
// returns its result unchanged.
func (app *goActiveLearningApp) SearchNegativeExamples(limit int) (model.Examples, error) {
	return app.repo.SearchNegativeExamples(limit)
}
 66 | 
// SearchUnlabeledExamples forwards to app.repo.SearchUnlabeledExamples and
// returns its result unchanged.
func (app *goActiveLearningApp) SearchUnlabeledExamples(limit int) (model.Examples, error) {
	return app.repo.SearchUnlabeledExamples(limit)
}
 70 | 
// SearchPositiveScoredExamples forwards to
// app.repo.SearchPositiveScoredExamples and returns its result unchanged.
func (app *goActiveLearningApp) SearchPositiveScoredExamples(limit int) (model.Examples, error) {
	return app.repo.SearchPositiveScoredExamples(limit)
}
 74 | 
// FindExampleByUlr forwards to app.repo.FindExampleByUlr and returns its
// result unchanged. ("Ulr" is the spelling used by the repository interface.)
func (app *goActiveLearningApp) FindExampleByUlr(url string) (*model.Example, error) {
	return app.repo.FindExampleByUlr(url)
}
 78 | 
// FindExampleById forwards to app.repo.FindExampleById and returns its result
// unchanged.
func (app *goActiveLearningApp) FindExampleById(id int) (*model.Example, error) {
	return app.repo.FindExampleById(id)
}
 82 | 
// SearchExamplesByUlrs forwards to app.repo.SearchExamplesByUlrs and returns
// its result unchanged.
func (app *goActiveLearningApp) SearchExamplesByUlrs(urls []string) (model.Examples, error) {
	return app.repo.SearchExamplesByUlrs(urls)
}
 86 | 
// SearchExamplesByIds forwards to app.repo.SearchExamplesByIds and returns its
// result unchanged.
func (app *goActiveLearningApp) SearchExamplesByIds(ids []int) (model.Examples, error) {
	return app.repo.SearchExamplesByIds(ids)
}
 90 | 
// SearchExamplesByKeywords forwards to app.repo.SearchExamplesByKeywords and
// returns its result unchanged.
func (app *goActiveLearningApp) SearchExamplesByKeywords(keywords []string, aggregator string, limit int) (model.Examples, error) {
	return app.repo.SearchExamplesByKeywords(keywords, aggregator, limit)
}
 94 | 
// DeleteAllExamples forwards to app.repo.DeleteAllExamples and returns its
// result unchanged.
func (app *goActiveLearningApp) DeleteAllExamples() error {
	return app.repo.DeleteAllExamples()
}
 98 | 
// CountPositiveExamples forwards to app.repo.CountPositiveExamples and returns
// its result unchanged.
func (app *goActiveLearningApp) CountPositiveExamples() (int, error) {
	return app.repo.CountPositiveExamples()
}
102 | 
// CountNegativeExamples forwards to app.repo.CountNegativeExamples and returns
// its result unchanged.
func (app *goActiveLearningApp) CountNegativeExamples() (int, error) {
	return app.repo.CountNegativeExamples()
}
106 | 
// CountUnlabeledExamples forwards to app.repo.CountUnlabeledExamples and
// returns its result unchanged.
func (app *goActiveLearningApp) CountUnlabeledExamples() (int, error) {
	return app.repo.CountUnlabeledExamples()
}
110 | 
// UpdateFeatureVector forwards to app.repo.UpdateFeatureVector and returns its
// result unchanged.
func (app *goActiveLearningApp) UpdateFeatureVector(e *model.Example) error {
	return app.repo.UpdateFeatureVector(e)
}
114 | 
// UpdateHatenaBookmark forwards to app.repo.UpdateHatenaBookmark and returns
// its result unchanged.
func (app *goActiveLearningApp) UpdateHatenaBookmark(e *model.Example) error {
	return app.repo.UpdateHatenaBookmark(e)
}
118 | 
// UpdateOrCreateReferringTweets forwards to
// app.repo.UpdateOrCreateReferringTweets and returns its result unchanged.
func (app *goActiveLearningApp) UpdateOrCreateReferringTweets(e *model.Example) error {
	return app.repo.UpdateOrCreateReferringTweets(e)
}
122 | 
// UpdateTweetLabel forwards to app.repo.UpdateTweetLabel and returns its
// result unchanged.
func (app *goActiveLearningApp) UpdateTweetLabel(exampleId int, idStr string, label model.LabelType) error {
	return app.repo.UpdateTweetLabel(exampleId, idStr, label)
}
126 | 
// SearchReferringTweets forwards to app.repo.SearchReferringTweets and returns
// its result unchanged.
func (app *goActiveLearningApp) SearchReferringTweets(limit int) (model.ReferringTweets, error) {
	return app.repo.SearchReferringTweets(limit)
}
130 | 
// SearchPositiveReferringTweets forwards to
// app.repo.SearchPositiveReferringTweets and returns its result unchanged.
func (app *goActiveLearningApp) SearchPositiveReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
	return app.repo.SearchPositiveReferringTweets(scoreThreshold, tweetsLimitInSameExample, limit)
}
134 | 
// SearchNegativeReferringTweets forwards to
// app.repo.SearchNegativeReferringTweets and returns its result unchanged.
func (app *goActiveLearningApp) SearchNegativeReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
	return app.repo.SearchNegativeReferringTweets(scoreThreshold, tweetsLimitInSameExample, limit)
}
138 | 
// SearchUnlabeledReferringTweets forwards to
// app.repo.SearchUnlabeledReferringTweets and returns its result unchanged.
func (app *goActiveLearningApp) SearchUnlabeledReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error) {
	return app.repo.SearchUnlabeledReferringTweets(scoreThreshold, tweetsLimitInSameExample, limit)
}
142 | 
143 | func (app *goActiveLearningApp) SearchRecentReferringTweetsWithHighScore(from time.Time, scoreThreshold float64, limit int) (model.ReferringTweets, error) {
144 | 	return app.repo.SearchRecentReferringTweetsWithHighScore(from, scoreThreshold, limit)
145 | }
146 | 
147 | func hatenaBookmarkByExampleId(hatenaBookmarks []*model.HatenaBookmark) map[int]*model.HatenaBookmark {
148 | 	result := make(map[int]*model.HatenaBookmark)
149 | 	for _, hb := range hatenaBookmarks {
150 | 		result[hb.ExampleId] = hb
151 | 	}
152 | 	return result
153 | }
154 | 
155 | func (app *goActiveLearningApp) AttachMetadataIncludingFeatureVector(examples model.Examples, bookmarkLimit int, tweetLimit int) error {
156 | 	// make sure that example id must be filled
157 | 	for _, e := range examples {
158 | 		if e.Id == 0 {
159 | 			tmp, err := app.FindExampleByUlr(e.Url)
160 | 			if err != nil {
161 | 				return err
162 | 			}
163 | 			e.Id = tmp.Id
164 | 		}
165 | 	}
166 | 
167 | 	fvList, err := app.repo.SearchFeatureVector(examples)
168 | 	if err != nil {
169 | 		return err
170 | 	}
171 | 
172 | 	for _, e := range examples {
173 | 		if fv, ok := fvList[e.Id]; ok {
174 | 			e.Fv = fv
175 | 		}
176 | 	}
177 | 
178 | 	return app.AttachMetadata(examples, bookmarkLimit, tweetLimit)
179 | }
180 | 
181 | func (app *goActiveLearningApp) AttachMetadata(examples model.Examples, bookmarkLimit int, tweetLimit int) error {
182 | 	hatenaBookmarks, err := app.repo.SearchHatenaBookmarks(examples, bookmarkLimit)
183 | 	if err != nil {
184 | 		return err
185 | 	}
186 | 	hbByid := hatenaBookmarkByExampleId(hatenaBookmarks)
187 | 	for _, e := range examples {
188 | 		if b, ok := hbByid[e.Id]; ok {
189 | 			e.HatenaBookmark = b
190 | 		} else {
191 | 			e.HatenaBookmark = &model.HatenaBookmark{Bookmarks: []*model.Bookmark{}}
192 | 		}
193 | 	}
194 | 
195 | 	referringTweetsById, err := app.repo.SearchReferringTweetsList(examples, tweetLimit)
196 | 	if err != nil {
197 | 		return err
198 | 	}
199 | 	for _, e := range examples {
200 | 		if t, ok := referringTweetsById[e.Id]; ok {
201 | 			e.ReferringTweets = &t
202 | 		} else {
203 | 			e.ReferringTweets = &model.ReferringTweets{}
204 | 		}
205 | 	}
206 | 	return nil
207 | }
208 | 
209 | func (app *goActiveLearningApp) UpdateRelatedExamples(related model.RelatedExamples) error {
210 | 	return app.repo.UpdateRelatedExamples(related)
211 | }
212 | 
213 | func (app *goActiveLearningApp) SearchRelatedExamples(e *model.Example) (model.Examples, error) {
214 | 	related, err := app.repo.FindRelatedExamples(e)
215 | 	if err != nil {
216 | 		return nil, err
217 | 	}
218 | 	return app.repo.SearchExamplesByIds(related.RelatedExampleIds)
219 | }
220 | 
221 | func (app *goActiveLearningApp) UpdateTopAccessedExampleIds(exampleIds []int) error {
222 | 	return app.repo.UpdateTopAccessedExampleIds(exampleIds)
223 | }
224 | 
225 | func (app *goActiveLearningApp) SearchTopAccessedExamples() (model.Examples, error) {
226 | 	exampleIds, err := app.repo.SearchTopAccessedExampleIds()
227 | 	if err != nil {
228 | 		return nil, err
229 | 	}
230 | 	return app.repo.SearchExamplesByIds(exampleIds)
231 | }
232 | 
233 | func (app *goActiveLearningApp) UpdateRecommendation(listName string, examples model.Examples) error {
234 | 	listType, err := model.GetRecommendationListType(listName)
235 | 	if err != nil {
236 | 		return err
237 | 	}
238 | 
239 | 	exampleIds := make([]int, 0)
240 | 	for _, e := range examples {
241 | 		exampleIds = append(exampleIds, e.Id)
242 | 	}
243 | 
244 | 	rec := model.Recommendation{RecommendationListType: listType, ExampleIds: exampleIds}
245 | 	return app.repo.UpdateRecommendation(rec)
246 | }
247 | 
248 | func (app *goActiveLearningApp) GetRecommendation(listName string) (model.Examples, error) {
249 | 	listType, err := model.GetRecommendationListType(listName)
250 | 	if err != nil {
251 | 		return nil, err
252 | 	}
253 | 	rec, err := app.repo.FindRecommendation(listType)
254 | 	return app.repo.SearchExamplesByIds(rec.ExampleIds)
255 | }
256 | 
257 | func (app *goActiveLearningApp) splitExamplesByStatusOK(examples model.Examples) (model.Examples, model.Examples, error) {
258 | 	urls := make([]string, 0)
259 | 	exampleByurl := make(map[string]*model.Example)
260 | 	for _, e := range examples {
261 | 		exampleByurl[e.Url] = e
262 | 		urls = append(urls, e.Url)
263 | 	}
264 | 	tmpExamples, err := app.SearchExamplesByUlrs(urls)
265 | 	if err != nil {
266 | 		return nil, nil, err
267 | 	}
268 | 
269 | 	examplesWithMetaData := model.Examples{}
270 | 	examplesWithEmptyMetaData := model.Examples{}
271 | 	for _, e := range tmpExamples {
272 | 		if e.StatusCode == http.StatusOK {
273 | 			examplesWithMetaData = append(examplesWithMetaData, exampleByurl[e.Url])
274 | 			delete(exampleByurl, e.Url)
275 | 		} else {
276 | 			examplesWithEmptyMetaData = append(examplesWithEmptyMetaData, exampleByurl[e.Url])
277 | 			delete(exampleByurl, e.Url)
278 | 		}
279 | 	}
280 | 	for _, e := range exampleByurl {
281 | 		examplesWithEmptyMetaData = append(examplesWithEmptyMetaData, e)
282 | 	}
283 | 	return examplesWithMetaData, examplesWithEmptyMetaData, nil
284 | }
285 | 
286 | func fetchMetaData(e *model.Example) error {
287 | 	article, err := fetcher.GetArticle(e.Url)
288 | 	if err != nil {
289 | 		return err
290 | 	}
291 | 
292 | 	e.Title = article.Title
293 | 	e.FinalUrl = article.Url
294 | 	e.Description = article.Description
295 | 	e.OgDescription = article.OgDescription
296 | 	e.OgType = article.OgType
297 | 	e.OgImage = article.OgImage
298 | 	e.Body = article.Body
299 | 	e.StatusCode = article.StatusCode
300 | 	e.Favicon = article.Favicon
301 | 
302 | 	now := time.Now()
303 | 	tooOldDate := time.Date(2000, time.January, 1, 1, 1, 0, 0, time.UTC)
304 | 	if article.PublishDate != nil && (now.After(*article.PublishDate) || tooOldDate.Before(*article.PublishDate)) {
305 | 		e.CreatedAt = *article.PublishDate
306 | 		e.UpdatedAt = *article.PublishDate
307 | 	}
308 | 
309 | 	fv := util.RemoveDuplicate(example.ExtractFeatures(*e))
310 | 	if len(fv) > 100000 {
311 | 		return fmt.Errorf("too large features (N = %d) for %s", len(fv), e.FinalUrl)
312 | 	}
313 | 	e.Fv = fv
314 | 
315 | 	return nil
316 | }
317 | 
318 | func (app *goActiveLearningApp) Fetch(examples model.Examples) {
319 | 	batchSize := 100
320 | 	examplesList := make([]model.Examples, 0)
321 | 	n := len(examples)
322 | 
323 | 	for i := 0; i < n; i += batchSize {
324 | 		max := int(math.Min(float64(i+batchSize), float64(n)))
325 | 		examplesList = append(examplesList, examples[i:max])
326 | 	}
327 | 	for _, l := range examplesList {
328 | 		examplesWithMetaData, examplesWithEmptyMetaData, err := app.splitExamplesByStatusOK(l)
329 | 		if err != nil {
330 | 			log.Println(err.Error())
331 | 		}
332 | 		// ToDo: 本当に必要か考える
333 | 		app.AttachMetadataIncludingFeatureVector(examplesWithMetaData, 0, 0)
334 | 
335 | 		wg := &sync.WaitGroup{}
336 | 		cpus := runtime.NumCPU()
337 | 		runtime.GOMAXPROCS(cpus)
338 | 		sem := make(chan struct{}, batchSize)
339 | 		for idx, e := range examplesWithEmptyMetaData {
340 | 			wg.Add(1)
341 | 			sem <- struct{}{}
342 | 			go func(e *model.Example, idx int) {
343 | 				defer wg.Done()
344 | 				cnt, err := app.repo.GetErrorCount(e)
345 | 				if err != nil {
346 | 					log.Println(err.Error())
347 | 				}
348 | 				if cnt < 5 {
349 | 					fmt.Fprintln(os.Stderr, "Fetching("+strconv.Itoa(idx)+"): "+e.Url)
350 | 					if err := fetchMetaData(e); err != nil {
351 | 						app.repo.IncErrorCount(e)
352 | 						log.Println(err.Error())
353 | 					}
354 | 				}
355 | 				<-sem
356 | 			}(e, idx)
357 | 		}
358 | 		wg.Wait()
359 | 	}
360 | }
361 | 


--------------------------------------------------------------------------------
/lib/service/example_test.go:
--------------------------------------------------------------------------------
  1 | package service_test
  2 | 
  3 | import (
  4 | 	"testing"
  5 | 
  6 | 	"github.com/syou6162/go-active-learning/lib/example"
  7 | 	"github.com/syou6162/go-active-learning/lib/model"
  8 | 	"github.com/syou6162/go-active-learning/lib/service"
  9 | )
 10 | 
 11 | func findExampleByurl(examples model.Examples, url string) *model.Example {
 12 | 	for _, e := range examples {
 13 | 		if e.Url == url {
 14 | 			return e
 15 | 		}
 16 | 	}
 17 | 	return nil
 18 | }
 19 | 
 20 | func TestAttachMetaData(t *testing.T) {
 21 | 	app, err := service.NewDefaultApp()
 22 | 	if err != nil {
 23 | 		t.Error(err)
 24 | 	}
 25 | 	defer app.Close()
 26 | 	if err := app.DeleteAllExamples(); err != nil {
 27 | 		t.Error("Cannot delete examples")
 28 | 	}
 29 | 
 30 | 	hatebuUrl := "https://b.hatena.ne.jp"
 31 | 	myBlogUrl := "https://www.yasuhisay.info"
 32 | 	githubUrl := "https://github.com"
 33 | 	e1 := example.NewExample(hatebuUrl, model.POSITIVE)
 34 | 	e2 := example.NewExample(myBlogUrl, model.NEGATIVE)
 35 | 	e3 := example.NewExample(githubUrl, model.UNLABELED)
 36 | 	examples := model.Examples{e1, e2, e3}
 37 | 
 38 | 	hatebu := findExampleByurl(examples, hatebuUrl)
 39 | 	if hatebu == nil {
 40 | 		t.Errorf("Cannot find %s", hatebuUrl)
 41 | 	}
 42 | 	if hatebu.Title != "" {
 43 | 		t.Errorf("Title must be empty for %s", hatebu.Url)
 44 | 	}
 45 | 	if len(hatebu.Fv) != 0 {
 46 | 		t.Errorf("Feature vector must be empty for %s", hatebu.Url)
 47 | 	}
 48 | 	app.AttachMetadataIncludingFeatureVector(examples, 10, 10)
 49 | 
 50 | 	if hatebu.Title != "" {
 51 | 		t.Errorf("Title must be empty for %s", hatebu.Url)
 52 | 	}
 53 | 	if len(hatebu.Fv) != 0 {
 54 | 		t.Errorf("Feature vector must be empty for %s", hatebu.Url)
 55 | 	}
 56 | 
 57 | 	myBlog := findExampleByurl(examples, myBlogUrl)
 58 | 	if myBlog == nil {
 59 | 		t.Errorf("Cannot find %s", myBlogUrl)
 60 | 	}
 61 | 	if myBlog.OgType != "" {
 62 | 		t.Errorf("OgType must be empty for %s", myBlog.Url)
 63 | 	}
 64 | 
 65 | 	app.Fetch(examples)
 66 | 	for _, e := range examples {
 67 | 		err = app.UpdateOrCreateExample(e)
 68 | 		if err != nil {
 69 | 			t.Error(err)
 70 | 		}
 71 | 		err = app.UpdateFeatureVector(e)
 72 | 		if err != nil {
 73 | 			t.Error(err)
 74 | 		}
 75 | 	}
 76 | 	if hatebu.Title == "" {
 77 | 		t.Errorf("Title must not be empty for %s", hatebu.Url)
 78 | 	}
 79 | 	if len(hatebu.Fv) == 0 {
 80 | 		t.Errorf("Feature vector must not be empty for %s", hatebu.Url)
 81 | 	}
 82 | 
 83 | 	if myBlog.OgType != "blog" {
 84 | 		t.Errorf("OgType must be blog for %s", myBlog.Url)
 85 | 	}
 86 | 
 87 | 	examples, err = app.SearchExamplesByIds([]int{e1.Id, e2.Id, e3.Id})
 88 | 	if err != nil {
 89 | 		t.Error(err)
 90 | 	}
 91 | 	err = app.AttachMetadataIncludingFeatureVector(examples, 10, 10)
 92 | 	if err != nil {
 93 | 		t.Error(err)
 94 | 	}
 95 | 
 96 | 	if hatebu.Title == "" {
 97 | 		t.Errorf("Title must be empty for %s", hatebu.Url)
 98 | 	}
 99 | 	if len(hatebu.Fv) == 0 {
100 | 		t.Errorf("Feature vector must not be empty for %s", hatebu.Url)
101 | 	}
102 | 
103 | 	if myBlog.OgType != "blog" {
104 | 		t.Errorf("OgType must be blog for %s", myBlog.Url)
105 | 	}
106 | }
107 | 
108 | func TestGetRecommendation(t *testing.T) {
109 | 	app, err := service.NewDefaultApp()
110 | 	if err != nil {
111 | 		t.Error(err)
112 | 	}
113 | 	defer app.Close()
114 | 	if err := app.DeleteAllExamples(); err != nil {
115 | 		t.Error("Cannot delete examples")
116 | 	}
117 | 
118 | 	e1 := example.NewExample("http://hoge1.com", model.POSITIVE)
119 | 	e2 := example.NewExample("http://hoge2.com", model.NEGATIVE)
120 | 	e3 := example.NewExample("http://hoge3.com", model.UNLABELED)
121 | 	examples := model.Examples{e1, e2, e3}
122 | 	for _, e := range examples {
123 | 		err = app.UpdateOrCreateExample(e)
124 | 		if err != nil {
125 | 			t.Error(err)
126 | 		}
127 | 	}
128 | 
129 | 	listName := "general"
130 | 	err = app.UpdateRecommendation(listName, examples)
131 | 	if err != nil {
132 | 		t.Error(err)
133 | 	}
134 | 	examples, err = app.GetRecommendation(listName)
135 | 	if err != nil {
136 | 		t.Error(err)
137 | 	}
138 | 	if len(examples) != 3 {
139 | 		t.Errorf("len(examples) should be 3, but %d", len(examples))
140 | 	}
141 | }
142 | 


--------------------------------------------------------------------------------
/lib/service/service.go:
--------------------------------------------------------------------------------
  1 | package service
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"io"
  6 | 	"time"
  7 | 
  8 | 	"github.com/syou6162/go-active-learning/lib/classifier"
  9 | 	"github.com/syou6162/go-active-learning/lib/model"
 10 | 	"github.com/syou6162/go-active-learning/lib/repository"
 11 | )
 12 | 
// GoActiveLearningApp is the service-layer interface of the application. It is
// the type returned by NewApp and NewDefaultApp.
type GoActiveLearningApp interface {
	// Example records: create/update, insert from input, search, lookup,
	// delete, and count.
	UpdateOrCreateExample(e *model.Example) error
	UpdateScore(e *model.Example) error
	InsertExampleFromScanner(scanner *bufio.Scanner) (*model.Example, error)
	InsertExamplesFromReader(reader io.Reader) error
	SearchExamples() (model.Examples, error)
	SearchRecentExamples(from time.Time, limit int) (model.Examples, error)
	SearchRecentExamplesByHost(host string, from time.Time, limit int) (model.Examples, error)
	SearchExamplesByLabel(label model.LabelType, limit int) (model.Examples, error)
	SearchLabeledExamples(limit int) (model.Examples, error)
	SearchPositiveExamples(limit int) (model.Examples, error)
	SearchNegativeExamples(limit int) (model.Examples, error)
	SearchUnlabeledExamples(limit int) (model.Examples, error)
	SearchPositiveScoredExamples(limit int) (model.Examples, error)
	FindExampleByUlr(url string) (*model.Example, error)
	FindExampleById(id int) (*model.Example, error)
	SearchExamplesByUlrs(urls []string) (model.Examples, error)
	SearchExamplesByIds(ids []int) (model.Examples, error)
	SearchExamplesByKeywords(keywords []string, aggregator string, limit int) (model.Examples, error)
	DeleteAllExamples() error
	CountPositiveExamples() (int, error)
	CountNegativeExamples() (int, error)
	CountUnlabeledExamples() (int, error)

	// MIRA classifier models.
	InsertMIRAModel(m classifier.MIRAClassifier) error
	FindLatestMIRAModel(modelType classifier.ModelType) (*classifier.MIRAClassifier, error)

	// Per-example feature vectors, Hatena bookmarks, and referring tweets.
	UpdateFeatureVector(e *model.Example) error
	UpdateHatenaBookmark(e *model.Example) error
	UpdateOrCreateReferringTweets(e *model.Example) error
	UpdateTweetLabel(exampleId int, idStr string, label model.LabelType) error
	SearchReferringTweets(limit int) (model.ReferringTweets, error)
	SearchPositiveReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error)
	SearchNegativeReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error)
	SearchUnlabeledReferringTweets(scoreThreshold float64, tweetsLimitInSameExample int, limit int) (model.ReferringTweets, error)
	SearchRecentReferringTweetsWithHighScore(from time.Time, scoreThreshold float64, limit int) (model.ReferringTweets, error)
	// Fetch has no return value, so it cannot report failures to the caller.
	Fetch(examples model.Examples)

	// Attaching bookmarks and tweets (and optionally feature vectors) to
	// examples already held in memory.
	AttachMetadataIncludingFeatureVector(examples model.Examples, bookmarkLimit int, tweetLimit int) error
	AttachMetadata(examples model.Examples, bookmarkLimit, tweetLimit int) error

	// Recommendation lists, addressed by list name.
	UpdateRecommendation(listName string, examples model.Examples) error
	GetRecommendation(listName string) (model.Examples, error)

	// Related examples.
	UpdateRelatedExamples(related model.RelatedExamples) error
	SearchRelatedExamples(e *model.Example) (model.Examples, error)

	// Top accessed examples.
	UpdateTopAccessedExampleIds(exampleIds []int) error
	SearchTopAccessedExamples() (model.Examples, error)

	// Ping and Close are forwarded to the underlying repository.
	Ping() error
	Close() error
}
 66 | 
 67 | func NewApp(repo repository.Repository) GoActiveLearningApp {
 68 | 	return &goActiveLearningApp{repo: repo}
 69 | }
 70 | 
 71 | func NewDefaultApp() (GoActiveLearningApp, error) {
 72 | 	repo, err := repository.New()
 73 | 	if err != nil {
 74 | 		return nil, err
 75 | 	}
 76 | 	return &goActiveLearningApp{repo: repo}, nil
 77 | }
 78 | 
// goActiveLearningApp is the GoActiveLearningApp implementation constructed by
// NewApp and NewDefaultApp.
type goActiveLearningApp struct {
	// repo is the repository that the app's methods call into.
	repo repository.Repository
}
 82 | 
 83 | func (app *goActiveLearningApp) InsertMIRAModel(m classifier.MIRAClassifier) error {
 84 | 	return app.repo.InsertMIRAModel(m)
 85 | }
 86 | 
 87 | func (app *goActiveLearningApp) FindLatestMIRAModel(modelType classifier.ModelType) (*classifier.MIRAClassifier, error) {
 88 | 	return app.repo.FindLatestMIRAModel(modelType)
 89 | }
 90 | 
 91 | func (app *goActiveLearningApp) Ping() error {
 92 | 	if err := app.repo.Ping(); err != nil {
 93 | 		return err
 94 | 	}
 95 | 	return nil
 96 | }
 97 | 
 98 | func (app *goActiveLearningApp) Close() error {
 99 | 	if err := app.repo.Close(); err != nil {
100 | 		return err
101 | 	}
102 | 	return nil
103 | }
104 | 


--------------------------------------------------------------------------------
/lib/top_accessed_example/top_accessed_example.go:
--------------------------------------------------------------------------------
 1 | package top_accessed_example
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"fmt"
 6 | 	"strconv"
 7 | 
 8 | 	"os"
 9 | 
10 | 	"github.com/syou6162/go-active-learning/lib/service"
11 | 	"github.com/urfave/cli"
12 | )
13 | 
// parseLine converts one input line into an example id.
//
// The whole line must be a base-10 integer that fits in an int; surrounding
// whitespace is not trimmed. Anything else yields 0 and an
// "Invalid line: ..." error.
func parseLine(line string) (int, error) {
	id, err := strconv.Atoi(line)
	if err == nil {
		return id, nil
	}
	return 0, fmt.Errorf("Invalid line: %s", line)
}
21 | 
22 | func readTopAccessedExampleIds(filename string) ([]int, error) {
23 | 	fp, err := os.Open(filename)
24 | 	defer fp.Close()
25 | 	if err != nil {
26 | 		return nil, err
27 | 	}
28 | 
29 | 	exampleIds := make([]int, 0)
30 | 	scanner := bufio.NewScanner(fp)
31 | 	for scanner.Scan() {
32 | 		line := scanner.Text()
33 | 		exampleId, err := parseLine(line)
34 | 		if err != nil {
35 | 			return nil, err
36 | 		}
37 | 		exampleIds = append(exampleIds, exampleId)
38 | 	}
39 | 	if err := scanner.Err(); err != nil {
40 | 		return nil, err
41 | 	}
42 | 	return exampleIds, nil
43 | }
44 | 
45 | func doAddTopAccessedExamples(c *cli.Context) error {
46 | 	inputFilename := c.String("input-filename")
47 | 
48 | 	if inputFilename == "" {
49 | 		_ = cli.ShowCommandHelp(c, "add-top-accessed-examples")
50 | 		return cli.NewExitError("`input-filename` is a required field.", 1)
51 | 	}
52 | 
53 | 	app, err := service.NewDefaultApp()
54 | 	if err != nil {
55 | 		return err
56 | 	}
57 | 	defer app.Close()
58 | 
59 | 	exampleIds, err := readTopAccessedExampleIds(inputFilename)
60 | 	if err != nil {
61 | 		return err
62 | 	}
63 | 	err = app.UpdateTopAccessedExampleIds(exampleIds)
64 | 	if err != nil {
65 | 		return err
66 | 	}
67 | 	return nil
68 | }
69 | 
// CommandAddTopAccessedExamples is the cli definition of the
// add-top-accessed-examples subcommand. Its action is
// doAddTopAccessedExamples, which requires the input-filename flag.
var CommandAddTopAccessedExamples = cli.Command{
	Name:  "add-top-accessed-examples",
	Usage: "add top accessed examples",
	Description: `
Add top accessed examples.
`,
	Action: doAddTopAccessedExamples,
	Flags: []cli.Flag{
		// Path of the file to read; doAddTopAccessedExamples rejects an empty
		// value.
		cli.StringFlag{Name: "input-filename"},
	},
}
81 | 


--------------------------------------------------------------------------------
/lib/util/converter/converter.go:
--------------------------------------------------------------------------------
 1 | package converter
 2 | 
 3 | import "github.com/syou6162/go-active-learning/lib/model"
 4 | import "github.com/syou6162/go-active-learning/lib/classifier"
 5 | 
 6 | func ConvertExamplesToLearningInstances(examples model.Examples) classifier.LearningInstances {
 7 | 	instances := classifier.LearningInstances{}
 8 | 	for _, e := range examples {
 9 | 		instances = append(instances, e)
10 | 	}
11 | 	return instances
12 | }
13 | 


--------------------------------------------------------------------------------
/lib/util/file/file.go:
--------------------------------------------------------------------------------
 1 | package file
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"errors"
 6 | 	"fmt"
 7 | 	"os"
 8 | 	"strconv"
 9 | 	"strings"
10 | 
11 | 	"github.com/syou6162/go-active-learning/lib/example"
12 | 	"github.com/syou6162/go-active-learning/lib/model"
13 | )
14 | 
15 | func ParseLine(line string) (*model.Example, error) {
16 | 	tokens := strings.Split(line, "\t")
17 | 	var url string
18 | 	if len(tokens) == 1 {
19 | 		url = tokens[0]
20 | 		return example.NewExample(url, model.UNLABELED), nil
21 | 	} else if len(tokens) == 2 {
22 | 		url = tokens[0]
23 | 		label, _ := strconv.ParseInt(tokens[1], 10, 0)
24 | 		switch model.LabelType(label) {
25 | 		case model.POSITIVE, model.NEGATIVE, model.UNLABELED:
26 | 			return example.NewExample(url, model.LabelType(label)), nil
27 | 		default:
28 | 			return nil, errors.New(fmt.Sprintf("Invalid Label type %d in %s", label, line))
29 | 		}
30 | 	} else {
31 | 		return nil, errors.New(fmt.Sprintf("Invalid line: %s", line))
32 | 	}
33 | }
34 | 
35 | func ReadExamples(filename string) ([]*model.Example, error) {
36 | 	fp, err := os.Open(filename)
37 | 	defer fp.Close()
38 | 	if err != nil {
39 | 		return nil, err
40 | 	}
41 | 
42 | 	scanner := bufio.NewScanner(fp)
43 | 	var examples model.Examples
44 | 	for scanner.Scan() {
45 | 		line := scanner.Text()
46 | 		e, err := ParseLine(line)
47 | 		if err != nil {
48 | 			return nil, err
49 | 		}
50 | 		examples = append(examples, e)
51 | 	}
52 | 	if err := scanner.Err(); err != nil {
53 | 		return nil, err
54 | 	}
55 | 	return examples, nil
56 | }
57 | 
58 | func WriteExamples(examples model.Examples, filename string) error {
59 | 	fp, err := os.Create(filename)
60 | 	defer fp.Close()
61 | 	if err != nil {
62 | 		return err
63 | 	}
64 | 
65 | 	writer := bufio.NewWriter(fp)
66 | 	for _, e := range examples {
67 | 		if e.IsNew && e.IsLabeled() {
68 | 			url := e.FinalUrl
69 | 			if url == "" {
70 | 				url = e.Url
71 | 			}
72 | 			_, err := writer.WriteString(url + "\t" + strconv.Itoa(int(e.Label)) + "\n")
73 | 			if err != nil {
74 | 				return err
75 | 			}
76 | 		}
77 | 	}
78 | 
79 | 	writer.Flush()
80 | 	return nil
81 | }
82 | 


--------------------------------------------------------------------------------
/lib/util/file/file_test.go:
--------------------------------------------------------------------------------
 1 | package file
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"testing"
 6 | 
 7 | 	"github.com/syou6162/go-active-learning/lib/example"
 8 | 	"github.com/syou6162/go-active-learning/lib/model"
 9 | )
10 | 
11 | func TestParseLine(t *testing.T) {
12 | 	line1 := "http://model.com\t1"
13 | 	e, err := ParseLine(line1)
14 | 
15 | 	if err != nil {
16 | 		t.Error("cannot parse line1")
17 | 	}
18 | 	if e.Label != model.POSITIVE {
19 | 		t.Error("Label must be POSITIVE")
20 | 	}
21 | 
22 | 	line2 := "http://model.com\t-1"
23 | 	e, err = ParseLine(line2)
24 | 
25 | 	if err != nil {
26 | 		t.Error("cannot parse line2")
27 | 	}
28 | 	if e.Label != model.NEGATIVE {
29 | 		t.Error("Label must be NEGATIVE")
30 | 	}
31 | 
32 | 	line3 := "http://model.com"
33 | 	e, err = ParseLine(line3)
34 | 
35 | 	if err != nil {
36 | 		t.Error("cannot parse line3")
37 | 	}
38 | 	if e.Label != model.UNLABELED {
39 | 		t.Error("Label must be UNLABELED")
40 | 	}
41 | 
42 | 	line4 := "http://model.com\t2"
43 | 	e, err = ParseLine(line4)
44 | 
45 | 	if e != nil {
46 | 		t.Error("wrong line format")
47 | 	}
48 | }
49 | 
50 | func TestReadExamples(t *testing.T) {
51 | 	filename := "../../../tech_input_example.txt"
52 | 	examples, err := ReadExamples(filename)
53 | 
54 | 	if err != nil {
55 | 		fmt.Println(err.Error())
56 | 		t.Error(fmt.Printf("Cannot read examples from %s\n", filename))
57 | 	}
58 | 	if len(examples) == 0 {
59 | 		t.Error(fmt.Printf("%s should contain more than one examples\n", filename))
60 | 	}
61 | }
62 | 
63 | func TestWriteExamples(t *testing.T) {
64 | 	filename := ".write_test.txt"
65 | 	e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
66 | 	e2 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
67 | 
68 | 	err := WriteExamples(model.Examples{e1, e2}, filename)
69 | 	if err != nil {
70 | 		t.Error(fmt.Printf("Cannot write examples to %s", filename))
71 | 	}
72 | 
73 | 	examples, err := ReadExamples(filename)
74 | 	if err != nil {
75 | 		t.Error(fmt.Printf("Cannot read examples from %s", filename))
76 | 	}
77 | 	if len(examples) == 2 {
78 | 		t.Error(fmt.Printf("%s should contain two examples", filename))
79 | 	}
80 | }
81 | 


--------------------------------------------------------------------------------
/lib/util/util.go:
--------------------------------------------------------------------------------
  1 | package util
  2 | 
  3 | import (
  4 | 	"os"
  5 | 
  6 | 	"github.com/syou6162/go-active-learning/lib/model"
  7 | )
  8 | 
  9 | func FilterLabeledExamples(examples model.Examples) model.Examples {
 10 | 	var result model.Examples
 11 | 	for _, e := range examples {
 12 | 		if e.IsLabeled() {
 13 | 			result = append(result, e)
 14 | 		}
 15 | 	}
 16 | 	return result
 17 | }
 18 | 
 19 | func FilterUnlabeledExamples(examples model.Examples) model.Examples {
 20 | 	result := model.Examples{}
 21 | 
 22 | 	alreadyLabeledByURL := make(map[string]bool)
 23 | 	alreadyLabeledByTitle := make(map[string]bool)
 24 | 	for _, e := range FilterLabeledExamples(examples) {
 25 | 		alreadyLabeledByURL[e.Url] = true
 26 | 		alreadyLabeledByURL[e.FinalUrl] = true
 27 | 		alreadyLabeledByTitle[e.Title] = true
 28 | 	}
 29 | 
 30 | 	for _, e := range examples {
 31 | 		if _, ok := alreadyLabeledByURL[e.Url]; ok {
 32 | 			continue
 33 | 		}
 34 | 		if _, ok := alreadyLabeledByTitle[e.Title]; ok {
 35 | 			continue
 36 | 		}
 37 | 		if !e.IsLabeled() {
 38 | 			alreadyLabeledByURL[e.Url] = true
 39 | 			alreadyLabeledByURL[e.FinalUrl] = true
 40 | 			alreadyLabeledByTitle[e.Title] = true
 41 | 			result = append(result, e)
 42 | 		}
 43 | 	}
 44 | 	return result
 45 | }
 46 | 
 47 | func RemoveDuplicate(args []string) []string {
 48 | 	results := make([]string, 0)
 49 | 	encountered := map[string]bool{}
 50 | 	for i := 0; i < len(args); i++ {
 51 | 		if !encountered[args[i]] {
 52 | 			encountered[args[i]] = true
 53 | 			results = append(results, args[i])
 54 | 		}
 55 | 	}
 56 | 	return results
 57 | }
 58 | 
 59 | func FilterStatusCodeOkExamples(examples model.Examples) model.Examples {
 60 | 	result := model.Examples{}
 61 | 
 62 | 	for _, e := range examples {
 63 | 		if e.StatusCode == 200 {
 64 | 			result = append(result, e)
 65 | 		}
 66 | 	}
 67 | 
 68 | 	return result
 69 | }
 70 | 
 71 | func FilterStatusCodeNotOkExamples(examples model.Examples) model.Examples {
 72 | 	result := model.Examples{}
 73 | 
 74 | 	for _, e := range examples {
 75 | 		if e.StatusCode != 200 {
 76 | 			result = append(result, e)
 77 | 		}
 78 | 	}
 79 | 
 80 | 	return result
 81 | }
 82 | 
 83 | func RemoveExample(examples model.Examples, toBeRemoved model.Example) model.Examples {
 84 | 	result := model.Examples{}
 85 | 
 86 | 	for _, e := range examples {
 87 | 		if e.Url != toBeRemoved.Url {
 88 | 			result = append(result, e)
 89 | 		}
 90 | 	}
 91 | 
 92 | 	return result
 93 | }
 94 | 
 95 | func RemoveNegativeExamples(examples model.Examples) model.Examples {
 96 | 	result := model.Examples{}
 97 | 	for _, e := range examples {
 98 | 		if e.Label != model.NEGATIVE {
 99 | 			result = append(result, e)
100 | 		}
101 | 	}
102 | 	return result
103 | }
104 | 
105 | func UniqueByFinalUrl(examples model.Examples) model.Examples {
106 | 	result := model.Examples{}
107 | 	m := make(map[string]bool)
108 | 	for _, e := range examples {
109 | 		if !m[e.FinalUrl] {
110 | 			m[e.FinalUrl] = true
111 | 			result = append(result, e)
112 | 		}
113 | 	}
114 | 	return result
115 | }
116 | 
117 | func UniqueByTitle(examples model.Examples) model.Examples {
118 | 	result := model.Examples{}
119 | 	m := make(map[string]bool)
120 | 	for _, e := range examples {
121 | 		if !m[e.Title] {
122 | 			m[e.Title] = true
123 | 			result = append(result, e)
124 | 		}
125 | 	}
126 | 	return result
127 | }
128 | 
129 | func GetEnv(key, fallback string) string {
130 | 	value, ok := os.LookupEnv(key)
131 | 	if !ok {
132 | 		value = fallback
133 | 	}
134 | 	return value
135 | }
136 | 


--------------------------------------------------------------------------------
/lib/util/util_test.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/syou6162/go-active-learning/lib/example"
 7 | 	"github.com/syou6162/go-active-learning/lib/model"
 8 | )
 9 | 
10 | func TestFilterLabeledExamples(t *testing.T) {
11 | 	e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
12 | 	e2 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
13 | 	e3 := example.NewExample("http://google.com", model.UNLABELED)
14 | 
15 | 	examples := FilterLabeledExamples(model.Examples{e1, e2, e3})
16 | 	if len(examples) != 2 {
17 | 		t.Error("Number of labeled examples should be 2")
18 | 	}
19 | }
20 | 
21 | func TestFilterUnlabeledExamples(t *testing.T) {
22 | 	e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
23 | 	e2 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
24 | 	e3 := example.NewExample("http://google.com", model.UNLABELED)
25 | 	e3.Title = "Google"
26 | 
27 | 	examples := FilterUnlabeledExamples(model.Examples{e1, e2, e3})
28 | 	if len(examples) != 1 {
29 | 		t.Error("Number of unlabeled examples should be 1")
30 | 	}
31 | }
32 | 
33 | func TestFilterStatusCodeOkExamples(t *testing.T) {
34 | 	e1 := example.NewExample("https://b.hatena.ne.jp", model.POSITIVE)
35 | 	e1.StatusCode = 200
36 | 	e2 := example.NewExample("https://www.yasuhisay.info", model.NEGATIVE)
37 | 	e2.StatusCode = 404
38 | 	e3 := example.NewExample("http://google.com", model.UNLABELED)
39 | 	e3.StatusCode = 304
40 | 
41 | 	examples := FilterStatusCodeOkExamples(model.Examples{e1, e2, e3})
42 | 	if len(examples) != 1 {
43 | 		t.Error("Number of examples (status code = 200) should be 1")
44 | 	}
45 | }
46 | 
47 | func TestUniqueByFinalUrl(t *testing.T) {
48 | 	e1 := model.Example{FinalUrl: "aaa"}
49 | 	e2 := model.Example{FinalUrl: "bbb"}
50 | 	e3 := model.Example{FinalUrl: "aaa"}
51 | 	examples := model.Examples{&e1, &e2, &e3}
52 | 	result := UniqueByFinalUrl(examples)
53 | 	if len(result) != 2 {
54 | 		t.Errorf("length(result) should be %d, but %d", 2, len(result))
55 | 	}
56 | }
57 | 
58 | func TestRemoveDuplicate(t *testing.T) {
59 | 	args := []string{"hoge", "fuga", "piyo", "hoge"}
60 | 
61 | 	result := RemoveDuplicate(args)
62 | 	if len(result) != 3 {
63 | 		t.Error("Number of unique string in args should be 3")
64 | 	}
65 | }
66 | 


--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"os"
 6 | 
 7 | 	"github.com/syou6162/go-active-learning/lib/command"
 8 | 	"github.com/urfave/cli"
 9 | )
10 | 
11 | func main() {
12 | 	app := cli.NewApp()
13 | 	app.Name = "go-active-learning"
14 | 	app.Commands = command.Commands
15 | 
16 | 	if err := app.Run(os.Args); err != nil {
17 | 		fmt.Fprintln(os.Stderr, err)
18 | 	}
19 | }
20 | 


--------------------------------------------------------------------------------
/migrations/0.sql:
--------------------------------------------------------------------------------
-- +migrate Up
-- Creates the example table: one row per URL with its label and timestamps.
CREATE TABLE IF NOT EXISTS example (
  "id" SERIAL NOT NULL PRIMARY KEY,
  "url" TEXT NOT NULL,
  "label" INT NOT NULL,
  "created_at" timestamp NOT NULL,
  "updated_at" timestamp NOT NULL
);

-- url is unique; the second index covers lookups by label ordered by newest
-- updated_at first.
CREATE UNIQUE INDEX IF NOT EXISTS "url_idx_example" ON example ("url");
CREATE INDEX IF NOT EXISTS "label_updated_at_idx_example" ON example ("label", "updated_at" DESC);

-- +migrate Down
-- Drops the indexes and then the table created above. Unlike the Up section,
-- these statements have no IF EXISTS guard.
DROP INDEX "url_idx_example";
DROP INDEX "label_updated_at_idx_example";

DROP TABLE example;
18 | 


--------------------------------------------------------------------------------
/migrations/1.sql:
--------------------------------------------------------------------------------
 1 | -- +migrate Up
 2 | ALTER TABLE "example" ADD COLUMN "final_url" TEXT DEFAULT '' NOT NULL; -- temporary '' default lets existing rows satisfy NOT NULL
 3 | UPDATE "example" SET "final_url" = "url"; -- backfill every existing row from "url"
 4 | ALTER TABLE "example" ALTER COLUMN "final_url" DROP DEFAULT; -- from here on inserts must supply final_url explicitly
 5 | 
 6 | ALTER TABLE "example" ADD COLUMN "title" TEXT; -- nullable, no default (same for the other plain TEXT columns below)
 7 | ALTER TABLE "example" ADD COLUMN "description" TEXT;
 8 | ALTER TABLE "example" ADD COLUMN "og_description" TEXT;
 9 | ALTER TABLE "example" ADD COLUMN "og_type" TEXT;
10 | ALTER TABLE "example" ADD COLUMN "og_image" TEXT;
11 | ALTER TABLE "example" ADD COLUMN "body" TEXT;
12 | ALTER TABLE "example" ADD COLUMN "score" DOUBLE PRECISION DEFAULT 0.0 NOT NULL; -- existing rows get 0.0
13 | ALTER TABLE "example" ADD COLUMN "is_new" BOOLEAN DEFAULT FALSE NOT NULL; -- existing rows get FALSE
14 | ALTER TABLE "example" ADD COLUMN "status_code" INT DEFAULT 0 NOT NULL; -- existing rows get 0
15 | ALTER TABLE "example" ADD COLUMN "favicon" TEXT;
16 | 
17 | CREATE UNIQUE INDEX IF NOT EXISTS "final_url_idx_example" ON example ("final_url"); -- one row per final_url; migrations/4.sql drops this index again
18 | 
19 | -- +migrate Down
20 | DROP INDEX "final_url_idx_example";
21 | 
22 | ALTER TABLE "example" DROP COLUMN "final_url"; -- the statements below remove every column added by the Up section
23 | ALTER TABLE "example" DROP COLUMN "title";
24 | ALTER TABLE "example" DROP COLUMN "description";
25 | ALTER TABLE "example" DROP COLUMN "og_description";
26 | ALTER TABLE "example" DROP COLUMN "og_type";
27 | ALTER TABLE "example" DROP COLUMN "og_image";
28 | ALTER TABLE "example" DROP COLUMN "body";
29 | ALTER TABLE "example" DROP COLUMN "score";
30 | ALTER TABLE "example" DROP COLUMN "is_new";
31 | ALTER TABLE "example" DROP COLUMN "status_code";
32 | ALTER TABLE "example" DROP COLUMN "favicon";
33 | 


--------------------------------------------------------------------------------
/migrations/10.sql:
--------------------------------------------------------------------------------
 1 | -- +migrate Up
 2 | CREATE TABLE IF NOT EXISTS recommendation ( -- pairs a list_type with the id of an example; no primary key
 3 |   "list_type" INT NOT NULL, -- NOTE(review): integer encoding of the list kind is not visible here; confirm
 4 |   "example_id" SERIAL NOT NULL, -- NOTE(review): SERIAL also attaches a sequence default; a plain INT looks intended for a foreign key; confirm before changing
 5 |    CONSTRAINT recommendation_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE -- deleting an example deletes its rows here
 6 | );
 7 | 
 8 | CREATE INDEX IF NOT EXISTS "list_type_idx_recommendation" ON recommendation ("list_type"); -- non-unique: many rows may share a list_type
 9 | 
10 | -- +migrate Down
11 | DROP INDEX "list_type_idx_recommendation";
12 | 
13 | DROP TABLE recommendation; -- removes the table together with all stored rows
14 | 


--------------------------------------------------------------------------------
/migrations/11.sql:
--------------------------------------------------------------------------------
1 | -- +migrate Up
2 | ALTER TABLE "example" ADD COLUMN "error_count" INT NOT NULL DEFAULT 0; -- existing rows start at 0
3 | 
4 | -- +migrate Down
5 | ALTER TABLE "example" DROP COLUMN "error_count"; -- discards any stored counts
6 | 


--------------------------------------------------------------------------------
/migrations/12.sql:
--------------------------------------------------------------------------------
1 | -- +migrate Up
2 | ALTER TABLE "tweet" ADD COLUMN "label" INT NOT NULL DEFAULT 0; -- existing tweets get 0; NOTE(review): meaning of 0 is not visible here; confirm
3 | 
4 | -- +migrate Down
5 | ALTER TABLE "tweet" DROP COLUMN "label"; -- discards any stored labels
6 | 


--------------------------------------------------------------------------------
/migrations/13.sql:
--------------------------------------------------------------------------------
 1 | -- +migrate Up
 2 | ALTER TABLE "model" ADD COLUMN "model_type" INT NOT NULL DEFAULT 0; -- existing rows get 0; NOTE(review): encoding of model_type is not visible here; confirm
 3 | ALTER TABLE "model" ADD COLUMN "c" DOUBLE PRECISION DEFAULT 0.0 NOT NULL; -- NOTE(review): presumably a training hyperparameter; confirm
 4 | ALTER TABLE "model" ADD COLUMN "accuracy" DOUBLE PRECISION DEFAULT 0.0 NOT NULL; -- this and the three columns below default to 0.0 for existing rows
 5 | ALTER TABLE "model" ADD COLUMN "precision" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;
 6 | ALTER TABLE "model" ADD COLUMN "recall" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;
 7 | ALTER TABLE "model" ADD COLUMN "fvalue" DOUBLE PRECISION DEFAULT 0.0 NOT NULL;
 8 | 
 9 | DROP INDEX "created_at_model"; -- replaced by the composite index created on the next line
10 | CREATE INDEX IF NOT EXISTS "model_type_created_at_model" ON model ("model_type", "created_at"); -- composite: model_type first, then created_at
11 | 
12 | -- +migrate Down
13 | DROP INDEX "model_type_created_at_model";
14 | 
15 | ALTER TABLE "model" DROP COLUMN "model_type"; -- the statements below remove every column added by the Up section
16 | ALTER TABLE "model" DROP COLUMN "c";
17 | ALTER TABLE "model" DROP COLUMN "accuracy";
18 | ALTER TABLE "model" DROP COLUMN "precision";
19 | ALTER TABLE "model" DROP COLUMN "recall";
20 | ALTER TABLE "model" DROP COLUMN "fvalue";
21 | 
22 | CREATE INDEX IF NOT EXISTS "created_at_model" ON model ("created_at"); -- restores the index originally created in migrations/9.sql
23 | 


--------------------------------------------------------------------------------
/migrations/14.sql:
--------------------------------------------------------------------------------
1 | -- +migrate Up
2 | ALTER TABLE "tweet" ADD COLUMN "score" DOUBLE PRECISION DEFAULT 0.0 NOT NULL; -- existing tweets get 0.0
3 | 
4 | -- +migrate Down
5 | ALTER TABLE "tweet" DROP COLUMN "score"; -- discards any stored scores
6 | 


--------------------------------------------------------------------------------
/migrations/15.sql:
--------------------------------------------------------------------------------
 1 | -- +migrate Up
 2 | CREATE TABLE IF NOT EXISTS related_example ( -- pairs of example ids (example_id, related_example_id); no primary key
 3 |   "example_id" SERIAL NOT NULL, -- NOTE(review): SERIAL also attaches a sequence default; a plain INT looks intended for a foreign key; confirm before changing
 4 |   "related_example_id" SERIAL NOT NULL, -- NOTE(review): same SERIAL-on-foreign-key concern as example_id; confirm
 5 |   CONSTRAINT related_example_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE, -- deleting the example deletes the pair
 6 |   CONSTRAINT related_example_related_example_id_fkey FOREIGN KEY ("related_example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE, -- deleting the related example deletes the pair
 7 |   CHECK(example_id != related_example_id) -- an example cannot be related to itself
 8 | );
 9 | 
10 | CREATE INDEX IF NOT EXISTS "example_id_idx_related_example" ON related_example ("example_id"); -- non-unique; related_example_id has no index of its own
11 | 
12 | -- +migrate Down
13 | DROP INDEX "example_id_idx_related_example";
14 | 
15 | DROP TABLE related_example; -- removes the table together with all stored pairs
16 | 


--------------------------------------------------------------------------------
/migrations/16.sql:
--------------------------------------------------------------------------------
 1 | -- +migrate Up
 2 | CREATE TABLE IF NOT EXISTS top_accessed_example ( -- a set of example ids; the table has no other columns
 3 |   "example_id" SERIAL NOT NULL, -- NOTE(review): SERIAL also attaches a sequence default; a plain INT looks intended for a foreign key; confirm before changing
 4 |   CONSTRAINT top_accessed_example_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE -- deleting an example removes it from this set
 5 | );
 6 | 
 7 | CREATE UNIQUE INDEX IF NOT EXISTS "example_id_idx_top_accessed_example" ON top_accessed_example ("example_id"); -- each example can appear at most once
 8 | 
 9 | -- +migrate Down
10 | DROP INDEX "example_id_idx_top_accessed_example";
11 | 
12 | DROP TABLE top_accessed_example; -- removes the table together with all stored rows
13 | 


--------------------------------------------------------------------------------
/migrations/2.sql:
--------------------------------------------------------------------------------
 1 | -- +migrate Up
 2 | CREATE TABLE IF NOT EXISTS feature ( -- one row per (example, feature string); no primary key or uniqueness constraint
 3 |   "example_id" SERIAL NOT NULL, -- NOTE(review): SERIAL also attaches a sequence default; a plain INT looks intended for a foreign key; confirm before changing
 4 |   "feature" TEXT NOT NULL,
 5 |   CONSTRAINT feature_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE -- deleting an example deletes its feature rows
 6 | );
 7 | 
 8 | CREATE INDEX IF NOT EXISTS "example_id_idx_example" ON feature ("example_id"); -- NOTE(review): the name ends in "_example" but the index is on feature; the Down section relies on this exact name
 9 | 
10 | -- +migrate Down
11 | DROP INDEX "example_id_idx_example";
12 | DROP TABLE feature; -- removes the table together with all stored rows
13 | 


--------------------------------------------------------------------------------
/migrations/3.sql:
--------------------------------------------------------------------------------
 1 | -- +migrate Up
 2 | ALTER TABLE "example" ALTER COLUMN "title" SET DEFAULT ''; -- SET DEFAULT affects future inserts only; the column stays nullable and existing NULLs are not rewritten
 3 | ALTER TABLE "example" ALTER COLUMN "description" SET DEFAULT ''; -- same treatment for each TEXT column added without a default in migrations/1.sql
 4 | ALTER TABLE "example" ALTER COLUMN "og_description" SET DEFAULT '';
 5 | ALTER TABLE "example" ALTER COLUMN "og_type" SET DEFAULT '';
 6 | ALTER TABLE "example" ALTER COLUMN "og_image" SET DEFAULT '';
 7 | ALTER TABLE "example" ALTER COLUMN "body" SET DEFAULT '';
 8 | ALTER TABLE "example" ALTER COLUMN "favicon" SET DEFAULT '';
 9 | 
10 | -- +migrate Down
11 | ALTER TABLE "example" ALTER COLUMN "title" DROP DEFAULT; -- back to no default; rows written in the meantime keep their values
12 | ALTER TABLE "example" ALTER COLUMN "description" DROP DEFAULT;
13 | ALTER TABLE "example" ALTER COLUMN "og_description" DROP DEFAULT;
14 | ALTER TABLE "example" ALTER COLUMN "og_type" DROP DEFAULT;
15 | ALTER TABLE "example" ALTER COLUMN "og_image" DROP DEFAULT;
16 | ALTER TABLE "example" ALTER COLUMN "body" DROP DEFAULT;
17 | ALTER TABLE "example" ALTER COLUMN "favicon" DROP DEFAULT;
18 | 


--------------------------------------------------------------------------------
/migrations/4.sql:
--------------------------------------------------------------------------------
1 | -- +migrate Up
2 | DROP INDEX "final_url_idx_example"; -- removes the unique index on example.final_url created in migrations/1.sql, so duplicates become possible
3 | 
4 | -- +migrate Down
5 | CREATE UNIQUE INDEX IF NOT EXISTS "final_url_idx_example" ON example ("final_url"); -- NOTE(review): fails if duplicate final_url values were stored after Up ran
6 | 


--------------------------------------------------------------------------------
/migrations/5.sql:
--------------------------------------------------------------------------------
 1 | -- +migrate Up
 2 | CREATE TABLE IF NOT EXISTS hatena_bookmark ( -- at most one row per example (see the unique index on example_id below)
 3 |   "id" SERIAL NOT NULL PRIMARY KEY,
 4 |   "example_id" SERIAL NOT NULL, -- NOTE(review): SERIAL also attaches a sequence default; a plain INT looks intended for a foreign key; confirm before changing
 5 |   "title" TEXT NOT NULL,
 6 |   "screenshot" TEXT NOT NULL,
 7 |   "entry_url" TEXT NOT NULL,
 8 |   "count" INT NOT NULL,
 9 |   "url" TEXT NOT NULL, -- unique via "url_idx_hatena_bookmark" below
10 |   "eid" TEXT NOT NULL,
11 |   CONSTRAINT hatena_bookmark_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE -- deleting an example deletes its hatena_bookmark row
12 | );
13 | 
14 | CREATE UNIQUE INDEX IF NOT EXISTS "example_id_idx_hatena_bookmark" ON hatena_bookmark ("example_id"); -- one hatena_bookmark row per example
15 | CREATE UNIQUE INDEX IF NOT EXISTS "url_idx_hatena_bookmark" ON hatena_bookmark ("url"); -- one hatena_bookmark row per url
16 | 
17 | CREATE TABLE IF NOT EXISTS bookmark ( -- child rows of hatena_bookmark; no primary key
18 |   "hatena_bookmark_id" SERIAL NOT NULL, -- NOTE(review): same SERIAL-on-foreign-key concern as hatena_bookmark.example_id; confirm
19 |   "user" TEXT NOT NULL, -- user is a reserved word in PostgreSQL, so the quoting is required
20 |   "comment" TEXT NOT NULL,
21 |   "tags" TEXT NOT NULL, -- NOTE(review): a single TEXT value; how multiple tags are serialized is not visible here; confirm
22 |   "timestamp" timestamp NOT NULL,
23 |   CONSTRAINT bookmark_hatena_bookmark_id_fkey FOREIGN KEY ("hatena_bookmark_id") REFERENCES hatena_bookmark("id") ON UPDATE NO ACTION ON DELETE CASCADE -- deleting a hatena_bookmark row deletes its bookmarks
24 | );
25 | 
26 | CREATE UNIQUE INDEX IF NOT EXISTS "hatena_bookmark_id_user_idx_bookmark" ON bookmark ("hatena_bookmark_id", "user"); -- at most one bookmark per user within a hatena_bookmark
27 | 
28 | -- +migrate Down
29 | DROP INDEX "hatena_bookmark_id_user_idx_bookmark";
30 | DROP INDEX "example_id_idx_hatena_bookmark";
31 | DROP INDEX "url_idx_hatena_bookmark";
32 | 
33 | DROP TABLE bookmark; -- dropped first because it references hatena_bookmark
34 | DROP TABLE hatena_bookmark;
35 | 


--------------------------------------------------------------------------------
/migrations/6.sql:
--------------------------------------------------------------------------------
 1 | -- +migrate Up
 2 | CREATE TABLE IF NOT EXISTS tweet ( -- tweets attached to an example; an example may have many
 3 |   "id" SERIAL NOT NULL PRIMARY KEY,
 4 |   "example_id" SERIAL NOT NULL, -- NOTE(review): SERIAL also attaches a sequence default; a plain INT looks intended for a foreign key; confirm before changing
 5 | 
 6 |   "created_at" timestamp NOT NULL, -- no default: every insert must supply it
 7 |   "id_str" TEXT NOT NULL, -- unique per example via "example_id_id_str_idx_tweet" below
 8 |   "full_text" TEXT NOT NULL,
 9 |   "favorite_count" INT NOT NULL,
10 |   "retweet_count" INT NOT NULL,
11 |   "lang" TEXT NOT NULL,
12 | 
13 |   "screen_name" TEXT NOT NULL,
14 |   "name" TEXT NOT NULL,
15 |   "profile_image_url" TEXT NOT NULL,
16 | 
17 |   CONSTRAINT tweet_example_id_fkey FOREIGN KEY ("example_id") REFERENCES example("id") ON UPDATE NO ACTION ON DELETE CASCADE -- deleting an example deletes its tweets
18 | );
19 | 
20 | CREATE INDEX IF NOT EXISTS "example_id_idx_tweet" ON tweet ("example_id"); -- NOTE(review): the unique index on the next line also leads with example_id, so this one may be redundant; confirm
21 | CREATE UNIQUE INDEX IF NOT EXISTS "example_id_id_str_idx_tweet" ON tweet ("example_id", "id_str"); -- the same id_str cannot be stored twice for one example
22 | 
23 | -- +migrate Down
24 | DROP INDEX "example_id_id_str_idx_tweet";
25 | DROP INDEX "example_id_idx_tweet";
26 | DROP TABLE tweet; -- removes the table together with all stored tweets
27 | 


--------------------------------------------------------------------------------
/migrations/7.sql:
--------------------------------------------------------------------------------
1 | -- +migrate Up
2 | ALTER TABLE "tweet" ADD COLUMN "retweeted" BOOLEAN NOT NULL DEFAULT false; -- existing tweets get false; migrations/8.sql removes this column again
3 | 
4 | -- +migrate Down
5 | ALTER TABLE "tweet" DROP COLUMN "retweeted"; -- discards any stored flags
6 | 


--------------------------------------------------------------------------------
/migrations/8.sql:
--------------------------------------------------------------------------------
1 | -- +migrate Up
2 | ALTER TABLE "tweet" DROP COLUMN "retweeted"; -- undoes the column added in migrations/7.sql; stored flags are lost
3 | 
4 | -- +migrate Down
5 | ALTER TABLE "tweet" ADD COLUMN "retweeted" BOOLEAN NOT NULL DEFAULT false; -- re-adds the column; every row gets false, earlier values are not restored
6 | 


--------------------------------------------------------------------------------
/migrations/9.sql:
--------------------------------------------------------------------------------
 1 | -- +migrate Up
 2 | CREATE TABLE IF NOT EXISTS model ( -- stored models, one per row; no primary key
 3 |   "model" TEXT NOT NULL, -- NOTE(review): presumably a serialized classifier; the format is not visible here, so confirm
 4 |   "created_at" timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP -- filled in by the database when the insert omits it
 5 | );
 6 | 
 7 | CREATE INDEX IF NOT EXISTS "created_at_model" ON model ("created_at"); -- migrations/13.sql replaces this with a (model_type, created_at) index
 8 | 
 9 | -- +migrate Down
10 | DROP INDEX "created_at_model";
11 | 
12 | DROP TABLE model; -- removes the table together with all stored models
13 | 


--------------------------------------------------------------------------------
/script/create_database.sql:
--------------------------------------------------------------------------------
1 | CREATE DATABASE "go-active-learning"; -- bootstrap script: creates two databases and a login role, then grants that role privileges
2 | CREATE DATABASE "go-active-learning-test"; -- NOTE(review): presumably the database used by the Go tests; confirm
3 | 
4 | CREATE ROLE "nobody" WITH PASSWORD 'nobody' LOGIN; -- NOTE(review): hard-coded credentials; looks intended for local development and CI only, so confirm
5 | 
6 | GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO nobody; -- NOTE(review): covers only tables that already exist in the database this session is connected to, not the two databases created above; confirm this is intended
7 | GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO nobody; -- NOTE(review): same scope caveat as the table grant on the previous line
8 | 


--------------------------------------------------------------------------------
/tech_input_example.txt:
--------------------------------------------------------------------------------
  1 | https://www.yasuhisay.info/	1
  2 | https://songmu.jp/riji/	1
  3 | https://blog.yuuk.io/	1
  4 | https://hakobe932.hatenablog.com/	1
  5 | https://motemen.hatenablog.com/	1
  6 | https://www3.nhk.or.jp/news/	-1
  7 | https://www.facebook.com/	-1
  8 | http://r.gnavi.co.jp/g-interview/	-1
  9 | https://suumo.jp/town/	-1
 10 | https://srdk.rakuten.jp/
 11 | https://kuenishi.hatenadiary.jp/entry/2017/05/25/005527
 12 | http://otiai10.hatenablog.com/entry/2017/05/24/163701
 13 | https://www.yasuhisay.info/entry/2016/11/23/000000
 14 | https://www.yasuhisay.info/entry/20090516/1242480413
 15 | https://www.yasuhisay.info/entry/2017/05/18/080000
 16 | https://arxiv.org/abs/1906.03776	1
 17 | https://tech-blog.optim.co.jp/entry/2019/07/04/173000	1
 18 | http://www.ai-gakkai.or.jp/my-bookmark_vol34-no4/	1
 19 | https://data.gunosy.io/entry/poincare_embedding_for_recommendations	1
 20 | http://englishforhackers.com/	-1
 21 | https://www.youtube.com/watch?v=5ZwknHMf1yo	-1
 22 | https://speakerdeck.com/livesense/shi-ye-heng-duan-zu-zhi-defalsemlsisutemukai-fa-yun-yong-toji-pan-she-ji	1
 23 | https://www.yasuhisay.info/entry/splatoon2_udemae_x	-1
 24 | https://www.yasuhisay.info/entry/2018/11/13/090000	-1
 25 | https://www.yasuhisay.info/entry/2016/09/26/080000	-1
 26 | https://www.yasuhisay.info/entry/2016/03/27/215344	-1
 27 | https://www.yasuhisay.info/entry/20110714/1310622171	-1
 28 | https://anond.hatelabo.jp/20190713043218	-1
 29 | https://cybozushiki.cybozu.co.jp/articles/m005412.html	-1
 30 | https://sorazine.soracom.jp/entry/2019/07/12/celestehair	-1
 31 | https://www.yasuhisay.info/entry/20090516/1242480413	1
 32 | https://www.yasuhisay.info/entry/kaggle_avazu_ctr_prediction	1
 33 | https://www.yasuhisay.info/entry/mlct_mackerel_anomaly_detection	1
 34 | https://www.yasuhisay.info/entry/2018/10/04/201000	1
 35 | https://honz.jp/articles/-/45278	-1
 36 | https://www.megamouth.info/entry/2019/07/12/175250	-1
 37 | https://www.lifehacker.jp/2019/07/193679_higedanshaku.html	-1
 38 | https://teineini.net/20190711-evernote-dokusyonote/	-1
 39 | https://dev.classmethod.jp/tool/be-vimmer-by-trainings/	1
 40 | https://www.clear-code.com/blog/2019/7/12.html	1
 41 | https://techlife.cookpad.com/entry/2019/07/13/055601	1
 42 | https://future-architect.github.io/articles/20190713/	1
 43 | https://blog.craftz.dog/my-dev-workflow-using-tmux-vim-48f73cc4f39e	1
 44 | https://junkyard.song.mu/slides/gocon2019-fukuoka/	1
 45 | https://nykergoto.hatenablog.jp/entry/2019/07/09/FFT_を使った時系列データ解析	1
 46 | http://memorability.csail.mit.edu/index.html	1
 47 | https://ynd.co/blog/tensorflow-vs-pytorch/	1
 48 | https://cloudplatform-jp.googleblog.com/2019/07/analyze-bigquery-data-with-kaggle-kernels-notebooks.html	1
 49 | https://omedstu.jimdo.com/2019/07/05/force法によるrecurrent-spiking-neural-networksの教師あり学習/	1
 50 | https://buildersbox.corp-sansan.com/entry/2019/07/12/110000	1
 51 | https://www.slideshare.net/shunsukekozawa5/gunosy-152302982	1
 52 | https://ml-loft.connpass.com/event/136426/	1
 53 | https://nlp.stanford.edu/blog/a-new-multi-turn-multi-domain-task-oriented-dialogue-dataset/	1
 54 | https://blog.ml.cmu.edu/2019/08/02/regret-circuits-composability-of-regret-minimizers/	1
 55 | https://tech-blog.abeja.asia/entry/noisy-label-ml-survey	1
 56 | https://imas.connpass.com/event/140272/	-1
 57 | http://alissak.hatenablog.com/entry/2019/08/03/170413	-1
 58 | https://ai.facebook.com/blog/advances-in-conversational-ai/	1
 59 | https://heartbeat.fritz.ai/a-2019-guide-to-semantic-segmentation-ca8242f5a7fc	1
 60 | http://morningproject.hateblo.jp/entry/2019/08/03/112605	-1
 61 | https://gendai.ismedia.jp/articles/-/66255	-1
 62 | https://oreno-yuigon.hatenablog.com/entry/2019/08/02/143028	-1
 63 | https://toyokeizai.net/articles/-/295714	-1
 64 | https://www.jigowatt121.com/entry/2019/08/03/183756	-1
 65 | https://shogi.zukeran.org/2019/07/31/konsen-1/	-1
 66 | https://note.mu/futashika/n/n382a4780b8bd	-1
 67 | https://www.hotpepper.jp/mesitsu/entry/hiro-watanabe/19-00148	-1
 68 | https://srdk.rakuten.jp/entry/2019/08/02/103000	-1
 69 | https://blog.hatenablog.com/entry/2019/07/26/180000	-1
 70 | https://www.huffingtonpost.jp/entry/oomura-conference_jp_5d454be5e4b0aca3411e2fe0	-1
 71 | https://togetter.com/li/1383361	-1
 72 | https://biz-journal.jp/2019/08/post_112573.html	-1
 73 | https://www.mofumofu.pink/entry/2019/08/03/144340	-1
 74 | https://nlab.itmedia.co.jp/nl/articles/1908/03/news013.html	-1
 75 | https://behappy.pink/fedelini-alla-puttanesca/	-1
 76 | https://helloandgoodbyecraft.com/jokes	-1
 77 | https://toyokeizai.net/articles/-/295293	-1
 78 | https://sirabee.com/2019/07/31/20162134165/	-1
 79 | https://www.around50-konkatsu.info/entry/2019/08/03/南の島へ現実逃避	-1
 80 | https://www7.ikutanpapa.com/entry/taketei	-1
 81 | https://datarobot.connpass.com/event/209149/	1
 82 | https://www.mediatechnology.jp/entry/2021/03/31/160000	1
 83 | https://openreview.net/forum?id=IrM64DGB21	1
 84 | https://github.com/intel-isl/DPT	1
 85 | https://blog.amedama.jp/entry/lgbm-data-size-vs-best-iters	1
 86 | https://recruit.gmo.jp/engineer/jisedai/blog/vision_transformer/	1
 87 | https://data.gunosy.io/entry/deim2021	1
 88 | https://logmi.jp/tech/articles/324141	1
 89 | https://qiita.com/jovyan/items/c41ab61a6b04e9a6e4df	1
 90 | https://github.com/manujosephv/pytorch_tabular	1
 91 | https://tech.retrieva.jp/entry/2021/04/01/114943	1
 92 | https://memo.sugyan.com/entry/2021/04/02/005434	1
 93 | https://rooftop.cc/news/2021/03/31160000.php	-1
 94 | https://www.tokio.inc/s/tokio/	-1
 95 | https://firego8.com/fire%e3%81%97%e3%81%be%e3%81%97%e3%81%9f%ef%bc%81	-1
 96 | https://comic-days.com/episode/3269632237302594670	-1
 97 | https://www.youtube.com/watch?v=oGEUZuicEYM	-1
 98 | https://www.youtube.com/watch?v=Cs_l0LIhg5M	-1
 99 | https://nlab.itmedia.co.jp/nl/articles/2104/01/news105.html	-1
100 | https://animeanime.jp/article/2021/04/02/60531.html	-1
101 | https://ja.kohsuke.org/%E3%82%BD%E3%83%95%E3%83%88%E3%82%A6%E3%82%A7%E3%82%A2%E9%96%8B%E7%99%BA/%E5%84%AA%E7%A7%80%E3%81%95%E3%81%AB%E3%81%A4%E3%81%84%E3%81%A6/	-1
102 | https://togetter.com/li/1691643	-1
103 | https://www.yasuhisay.info/entry/2021/02/12/090000	1
104 | https://www.yasuhisay.info/entry/2021/02/25/130000	1
105 | https://www.yasuhisay.info/entry/2021/03/12/114500	1
106 | https://www.yasuhisay.info/entry/2021/03/24/090000	1
107 | https://www.yasuhisay.info/entry/2021/03/25/083000	1
108 | https://www.yasuhisay.info/entry/2021/03/28/143000	1
109 | https://wapa5pow.com/posts/2021-03-31--day-one-in-project	1
110 | https://aws.amazon.com/jp/blogs/startup/tech-case-study-jp-startup-ai-ml/	1
111 | https://nhiroki.jp/2021/03/31/design-docs	1
112 | https://dev.classmethod.jp/articles/ways-to-check-fargate-cpu-usage/	1
113 | https://dev.classmethod.jp/articles/amazon-route-53-resolver-dns-firewall/	1
114 | https://scrapbox.io/mizdra/chrome_devtools_%E3%81%AE_tips_N%E9%80%A3%E7%99%BA	1
115 | https://zenn.dev/saboyutaka/articles/07f1351a6b0049	1
116 | 


--------------------------------------------------------------------------------