├── .gitignore ├── .gitmodules ├── .travis.yml ├── LICENSE ├── README.org ├── conf ├── defaul.conf ├── epl-ent.conf ├── epl-synth.conf ├── epl.conf ├── eval-epl.conf ├── eval-subsample.conf ├── eval.conf ├── mf-bpr.conf ├── mf-oc.conf ├── mf.conf ├── naacl2015-Joint.conf ├── naacl2015-MF.conf ├── naacl2015-Zero-Inf.conf ├── naacl2015-Zero-Joint.conf ├── naacl2015-Zero-MF.conf ├── naacl2015-Zero-Post.conf └── naacl2015-Zero-Pre.conf ├── data ├── eval │ ├── Set1.plt │ ├── eval.gpl │ ├── hist.gpl │ └── results.gpl ├── formulae │ └── filtered.txt └── naacl2013 │ ├── naacl2013.gold.tsv │ ├── nyt-freebase.test.subsample-10000-LABELED.tuples.txt │ ├── nyt-freebase.test.subsample-10000.tuples.txt │ └── structured │ ├── eval-naacl13-structured.out.txt │ ├── test-mintz09.txt │ ├── test-riedel13-model-F.txt │ ├── test-riedel13-model-N.txt │ ├── test-riedel13-model-NF.txt │ ├── test-riedel13-model-NFE.txt │ ├── test-rockt-F.txt │ ├── test-surdeanu12.txt │ └── test-yao11.txt ├── overview.png ├── project ├── Build.scala └── plugins.sbt └── src ├── main └── scala │ └── uclmr │ ├── AnnotationTool.scala │ ├── EmbeddedProbLogicEvaluation.scala │ ├── EntityAwareEvaluation.scala │ ├── EntityAwarePredictor.scala │ ├── FactorizationUtil.scala │ ├── GeometricMF.scala │ ├── LogicalInference.scala │ ├── MatrixFactorization.scala │ ├── OCSVM.scala │ ├── PimpMyFactorie.scala │ ├── ProbLogicEmbeddings.scala │ ├── SoftLogicPotentials.scala │ ├── TensorDB.scala │ ├── future │ ├── KB.scala │ └── MatrixFactorization2.scala │ ├── hack │ ├── CoNLLHackReader.scala │ └── MTShowcase.scala │ ├── io │ ├── FIGER.scala │ ├── FigerPB.scala │ ├── MatrixFilter.scala │ ├── NAACL.scala │ └── TSV.scala │ └── util │ ├── ArgMaxSigmoid.scala │ ├── DataInspector.scala │ ├── FormulaeAnnotator.scala │ ├── FormulaeExtractor.scala │ ├── OptimiseMatrixFactorizationHyperParameters.scala │ ├── Predictor.scala │ ├── SubsampleExperiments.scala │ └── VectorInspector.scala └── test └── scala └── uclmr ├── PotentialsSpec.scala └── TensorDBSpec.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache/ 6 | .history/ 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # intellij 16 | .idea 17 | .idea/* 18 | .idea_modules 19 | !.idea/codeStyleSettings.xml 20 | *.iml -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "wolfe"] 2 | path = wolfe 3 | url = git@github.com:wolfe-pack/wolfe.git 4 | branch = dev 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.10.4 4 | jdk: 5 | - oraclejdk8 6 | 7 | #script: "sbt clean scoverage:test" 8 | script: "sbt test" 9 | 10 | #after_script: 11 | # 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013-2016 University College London 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | ** Low-rank Logic Embeddings 2 | 3 | [[https://travis-ci.org/uclmr/low-rank-logic][https://travis-ci.org/uclmr/low-rank-logic.svg?branch=master]] 4 | [[https://gitter.im/uclmr/low-rank-logic?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge][file:https://badges.gitter.im/Join%20Chat.svg]] 5 | [[https://waffle.io/uclmr/low-rank-logic][https://badge.waffle.io/uclmr/low-rank-logic.png?label=ready&title=Ready]] 6 | 7 | This repository contains code accompanying the paper: 8 | [[http://rockt.github.io/][Tim Rocktäschel]], [[http://sameersingh.org][Sameer Singh]] and [[http://www.riedelcastro.org/][Sebastian Riedel]]. _Injecting Logical Background Knowledge into Embeddings for Relation Extraction_. /in: Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics – Human Language Technologies (NAACL HLT)/. 2015. [[[http://rockt.github.io/bib/rocktaschel2015injecting.bib][bib]]] [[[http://rockt.github.io/pdf/rocktaschel2015injecting.pdf][pdf]]] 9 | 10 | 11 | [[./overview.png]] 12 | 13 | 14 | *** Prerequisites 15 | - [[http://git-scm.com/][git]] to clone the repository 16 | - [[http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html][Java JDK]] and [[http://www.scala-sbt.org/][sbt]] to run code 17 | - [[http://www.gnuplot.info/][gnuplot]] to reproduce evaluation plots 18 | - pdflatex to reproduce PDF tables 19 | 20 | *** Download code 21 | **** Clone the repository 22 | #+BEGIN_SRC sh :session mf :results silent 23 | cd ~/workspace 24 | git clone https://github.com/uclmr/low-rank-logic.git && cd low-rank-logic 25 | git submodule update --init --recursive 26 | cd wolfe 27 | git checkout tags/v0.4.0 28 | cd .. 29 | #+END_SRC 30 | 31 | **** COMMENT Move to the project directory 32 | #+BEGIN_SRC sh :session mf :results silent 33 | cd ~/workspace/low-rank-logic 34 | #+END_SRC 35 | 36 | **** Compile and test 37 | #+BEGIN_SRC sh :session mf :results silent 38 | sbt clean compile test 39 | #+END_SRC 40 | 41 | 42 | *** Download data 43 | **** NAACL13 44 | If you have a license for the NYT corpus, please send us an email to obtain the link to the =naacl2013.txt.zip= file.
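The tuple files follow a naming convention that the loaders rely on: surface patterns start with =path=, Freebase relations start with =REL$=, and =REL$NA= is the NA placeholder that gets filtered out. Below is a minimal sketch of that convention, mirroring the =relationFilter= in =EmbeddedProbLogicEvaluation.scala= (the two helper names are ours, for illustration only):

#+BEGIN_SRC scala
// Sketch of the relation-name convention only; the helper names are illustrative.
def isSurfacePattern(rel: String): Boolean = rel.startsWith("path")
def isFreebaseRelation(rel: String): Boolean = rel.startsWith("REL$") && rel != "REL$NA"
// Mirrors relationFilter in EmbeddedProbLogicEvaluation.scala:
def relationFilter(rel: String): Boolean = isSurfacePattern(rel) || isFreebaseRelation(rel)
#+END_SRC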
45 | 46 | ***** Move to resources 47 | #+BEGIN_SRC sh :session mf :results silent 48 | mv ~/Downloads/naacl2013.txt.zip ./src/main/resources 49 | #+END_SRC 50 | 51 | *** Comparison on Complete Data (Figure 3) 52 | **** Matrix factorization 53 | #+BEGIN_SRC sh :session mf :results silent 54 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-MF.conf' 55 | #+END_SRC 56 | 57 | **** Joint Optimization (Section 3.2) 58 | #+BEGIN_SRC sh :session mf :results silent 59 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-Joint.conf' 60 | #+END_SRC 61 | 62 | *** Zero-shot Relation Learning (Table 1) 63 | - MF (matrix factorization) 64 | #+BEGIN_SRC sh :session mf :results silent 65 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-Zero-MF.conf' 66 | #+END_SRC 67 | - Inf (logical inference) 68 | #+BEGIN_SRC sh :session mf :results silent 69 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-Zero-Inf.conf' 70 | #+END_SRC 71 | - Post (post-factorization inference) 72 | #+BEGIN_SRC sh :session mf :results silent 73 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-Zero-Post.conf' 74 | #+END_SRC 75 | - Pre (pre-factorization inference) 76 | #+BEGIN_SRC sh :session mf :results silent 77 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-Zero-Pre.conf' 78 | #+END_SRC 79 | - Joint (joint optimization) 80 | #+BEGIN_SRC sh :session mf :results silent 81 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-Zero-Joint.conf' 82 | #+END_SRC 83 | 84 | **** Generating Figure 85 | #+BEGIN_SRC sh :session mf :results silent 86 | cd data/eval 87 | tail -105 ../out/experiments.log > subsample.log 88 | gnuplot -e 'fileName = "subsample"' eval.gpl 89 | open subsample-wMAP.pdf 90 | #+END_SRC 91 | 92 | *** Relations with Few Distant Labels (Figure 2) 93 | #+BEGIN_SRC sh :session mf :results silent 94 | sbt 'run-main uclmr.util.SubsampleExperiments 4' 95 | #+END_SRC 96 | =4= is the number of threads used to run experiments in parallel. 
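=SubsampleExperiments= itself lives in =src/main/scala/uclmr/util/= and is not reproduced here; the sketch below only illustrates the general pattern of dispatching runs onto a fixed-size thread pool. The fraction grid and the =runExperiment= body are placeholders, not the actual implementation.

#+BEGIN_SRC scala
import java.util.concurrent.Executors
import scala.concurrent.duration.Duration
import scala.concurrent.{Await, ExecutionContext, Future}

// Hypothetical sketch of parallel experiment dispatch; not SubsampleExperiments itself.
object ParallelRunsSketch {
  def main(args: Array[String]): Unit = {
    val threads = args.headOption.map(_.toInt).getOrElse(4)
    val pool = Executors.newFixedThreadPool(threads)
    implicit val ec = ExecutionContext.fromExecutorService(pool)
    val fractions = Seq(0.0, 0.1, 0.2, 0.3, 0.4, 0.5)           // placeholder grid
    def runExperiment(f: Double): String = s"subsample=$f done" // placeholder for one run
    val runs = fractions.map(f => Future(runExperiment(f)))
    Await.result(Future.sequence(runs), Duration.Inf).foreach(println)
    pool.shutdown()
  }
}
#+END_SRC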
97 | 98 | *** Evaluate predictions 99 | #+BEGIN_SRC sh :session mf :results silent 100 | sbt 'run-main uclmr.io.EvaluateNAACL ./conf/eval.conf ./data/out/latest/predict.txt' 101 | #+END_SRC 102 | 103 | **** COMMENT Open PR curve 104 | #+BEGIN_SRC sh :session mf :results silent 105 | open ./data/out/latest/11pointPrecRecall.pdf 106 | #+END_SRC 107 | 108 | **** COMMENT Open results table 109 | #+BEGIN_SRC sh :session mf :results silent 110 | open ./data/out/latest/table.pdf 111 | #+END_SRC 112 | 113 | *** Citing 114 | #+BEGIN_SRC latex 115 | @inproceedings{rocktaschel2015injecting, 116 | title={{Injecting Logical Background Knowledge into Embeddings for Relation Extraction}}, 117 | author={Rockt{\"a}schel, Tim and Singh, Sameer and Riedel, Sebastian}, 118 | booktitle = {Conference of the North American Chapter of the Association for Computational Linguistics – Human Language Technologies (NAACL HLT)}, 119 | year={2015} 120 | } 121 | #+END_SRC 122 | -------------------------------------------------------------------------------- /conf/defaul.conf: -------------------------------------------------------------------------------- 1 | evalConf: "eval.conf" 2 | -------------------------------------------------------------------------------- /conf/epl-ent.conf: -------------------------------------------------------------------------------- 1 | epl { 2 | train: "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.train.tuples.txt" 3 | unlabeled: "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.test.tuples.txt" 4 | test: "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.test.subsample-10000.tuples.txt" 5 | 6 | print-comparisons: false 7 | 8 | use-unlabeled: true 9 | use-entitites: false 10 | 11 | combine-datasets: false 12 | min-rows: 20 13 | min-cols: 2 14 | min-cooccur: 1.0 15 | weigh-terms: false 16 | unit-ball:false 17 | l2-dist: true 18 | prior-repulsion: 0 19 | 20 | relation-dim: 20 21 | subsample: 0.5 22 | opt-iterations: 20 23 | norm-b: true 24 | trainer: "online" 25 | ada-rate: 1.0 26 | 27 | scale-prior: 1.0 28 | bias-prior: 0.0 29 | mult-prior: 1.0 30 | 31 | reg-embed: 0.1 32 | reg-scale: 0.1 33 | reg-bias: 0.1 34 | reg-mult: Infinity 35 | 36 | } -------------------------------------------------------------------------------- /conf/epl-synth.conf: -------------------------------------------------------------------------------- 1 | epl { 2 | relation-dim: 1 3 | subsample: 1.0 4 | opt-iterations: 1000 5 | norm-b: true 6 | ada-rate: 1.0 7 | trainer: "online" 8 | unit-ball: false 9 | l2-dist: true 10 | 11 | scale-prior: 1.0 12 | bias-prior: 0.0 13 | mult-prior: 1.0 14 | 15 | reg-embed: 0.01 16 | reg-scale: Infinity 17 | reg-bias: 0.01 18 | reg-mult: Infinity 19 | } -------------------------------------------------------------------------------- /conf/epl.conf: -------------------------------------------------------------------------------- 1 | epl { 2 | train: "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.train.tuples.txt" 3 | unlabeled: "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.test.tuples.txt" 4 | test: "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.test.subsample-10000.tuples.txt" 5 | 6 | print-comparisons: false 7 | 8 | use-unlabeled: true 9 | use-entitites: true 10 | 11 | combine-datasets: false 12 | min-rows: 5 13 | min-cols: 2 14 | min-cooccur: 1.0 15 | weigh-terms: false 16 | unit-ball:false 17 | l2-dist: true 18 | prior-repulsion: 0 19 | 20 | relation-dim: 20 21 | subsample: 0.5 22 | opt-iterations: 100 23 | norm-b: true 24 | trainer: "online" 25 | 
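// ada-rate below is, by our reading of the key name, the step size of the AdaGrad-based "online" trainer (undocumented in the repo)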
ada-rate: 1.0 26 | 27 | scale-prior: 1.0 28 | bias-prior: 0.0 29 | mult-prior: 1.0 30 | 31 | reg-embed: 0.1 32 | reg-scale: 0.1 33 | reg-bias: 0.1 34 | reg-mult: Infinity 35 | 36 | } -------------------------------------------------------------------------------- /conf/eval-epl.conf: -------------------------------------------------------------------------------- 1 | eval { 2 | extra-relations: [] 3 | targets: ["person/company$","person/nationality$","team_owner/teams_owned$","company/founders$", 4 | "location/containedby$","neighborhood/neighborhood_of", 5 | "parent/child$","person/parents$","person/place_of_birth$","person/place_of_death$", 6 | "author/works_written$","team/arena_stadium$", 7 | "film/directed_by","roadcast/area_served$","structure/architect$", 8 | "composer/compositions$","sports_team/league$","person/religion$","film/produced_by$" 9 | ] 10 | //gold: "data/2014.gold.tsv" 11 | //gold: "data/annotations/latest.tsv" 12 | gold: "data/sriedel-annotation/latest.tsv" 13 | pool-depth: 100 14 | run-depth: 1000 15 | subsample: "data/naacl2013/nyt-freebase.test.subsample-10000.tuples.txt" 16 | } -------------------------------------------------------------------------------- /conf/eval-subsample.conf: -------------------------------------------------------------------------------- 1 | eval { 2 | extra-relations: [] 3 | targets: ["person/company$","person/nationality$","company/founders$", 4 | "location/containedby$","neighborhood/neighborhood_of", 5 | "parent/child$","person/parents$","person/place_of_birth$","person/place_of_death$", 6 | "author/works_written$", 7 | "film/directed_by", "film/produced_by$" 8 | ] 9 | //gold: "data/2014.gold.tsv" 10 | //gold: "data/annotations/latest.tsv" 11 | gold: "data/naacl2013/naacl2013.gold.tsv" 12 | pool-depth: 100 13 | run-depth: 1000 14 | subsample: "data/naacl2013/nyt-freebase.test.subsample-10000.tuples.txt" 15 | } -------------------------------------------------------------------------------- /conf/eval.conf: -------------------------------------------------------------------------------- 1 | eval { 2 | extra-relations: [] 3 | targets: ["person/company$","person/nationality$","team_owner/teams_owned$","company/founders$", 4 | "location/containedby$","neighborhood/neighborhood_of", 5 | "parent/child$","person/parents$","person/place_of_birth$","person/place_of_death$", 6 | "author/works_written$","team/arena_stadium$", 7 | "film/directed_by","roadcast/area_served$","structure/architect$", 8 | "composer/compositions$","sports_team/league$","person/religion$","film/produced_by$" 9 | ] 10 | //gold: "data/2014.gold.tsv" 11 | //gold: "data/annotations/latest.tsv" 12 | gold: "data/naacl2013/naacl2013.gold.tsv" 13 | pool-depth: 100 14 | run-depth: 1000 15 | subsample: "data/naacl2013/nyt-freebase.test.subsample-10000.tuples.txt" 16 | } -------------------------------------------------------------------------------- /conf/mf-bpr.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: true 8 | 9 | subsample: 1.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | 
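// by our reading, minDataHint/minMFHint/minNumPremises threshold which formulae are used; zero values keep them all (undocumented)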
minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "mf" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval.conf" 61 | -------------------------------------------------------------------------------- /conf/mf-oc.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.1 4 | alpha: 0.1 5 | maxIter: 1 6 | 7 | subsample: 1.0 8 | negPerPos: 1 9 | unobservedPerF: 1 10 | 11 | cellWeight: 1.0 12 | formulaeWeight: 1.0 13 | 14 | gamma: 0 //0.01 //0.01 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | use-features: false 24 | 25 | debug: true 26 | 27 | outFile: "predict.txt" 28 | } 29 | 30 | //formulaeFile: "data/formulae/curated-50-100.txt" 31 | //formulaeFile: "data/formulae/curated.txt" 32 | outDir: "data/out" 33 | logFile: "data/out/experiments.log" 34 | -------------------------------------------------------------------------------- /conf/mf.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 1.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "mf" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-Joint.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 1.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | 
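// note: this file is identical to mf.conf except for mode: "low-rank-logic" at the bottom, i.e. the joint optimization of Section 3.2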
inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "low-rank-logic" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-MF.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 1.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "mf" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-Zero-Inf.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 0.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "inference-only" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval-subsample.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-Zero-Joint.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 
| bpr: false 8 | 9 | subsample: 0.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "low-rank-logic" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval-subsample.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-Zero-MF.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 0.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "mf" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval-subsample.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-Zero-Post.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 0.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "post-inference" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 
51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval-subsample.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-Zero-Pre.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 0.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "pre-inference" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval-subsample.conf" 61 | -------------------------------------------------------------------------------- /data/eval/Set1.plt: -------------------------------------------------------------------------------- 1 | # line styles for ColorBrewer Set1 2 | # for use with qualitative/categorical data 3 | # provides 8 easy-to-name colors 4 | # compatible with gnuplot >=4.2 5 | # author: Anna Schneider 6 | 7 | # line styles 8 | set style line 1 lc rgb '#E41A1C' # red 9 | set style line 2 lc rgb '#377EB8' # blue 10 | set style line 3 lc rgb '#4DAF4A' # green 11 | set style line 4 lc rgb '#984EA3' # purple 12 | set style line 5 lc rgb '#FF7F00' # orange 13 | set style line 6 lc rgb '#FFFF33' # yellow 14 | set style line 7 lc rgb '#A65628' # brown 15 | set style line 8 lc rgb '#F781BF' # pink 16 | 17 | # palette 18 | set palette maxcolors 8 19 | set palette defined ( 0 '#E41A1C',\ 20 | 1 '#377EB8',\ 21 | 2 '#4DAF4A',\ 22 | 3 '#984EA3',\ 23 | 4 '#FF7F00',\ 24 | 5 '#FFFF33',\ 25 | 6 '#A65628',\ 26 | 7 '#F781BF' ) -------------------------------------------------------------------------------- /data/eval/eval.gpl: -------------------------------------------------------------------------------- 1 | # Chart settings 2 | #set title "Injecting Logic into Synthetic Matrices" 3 | set terminal dumb enhanced 4 | #set term x11 5 | #set termoption enhanced 6 | set key right center 7 | #set key width -1.5 8 | set key spacing 1.1 9 | set key box linewidth 3 10 | set border linewidth 3 11 | 12 | #load 'Spectral.plt' 13 | load 'Set1.plt' 14 | 15 | if (!exists("fileName")) fileName='experiments' 16 | 17 | set style line 3 lc rgb '#4DAF4A' lt 6 lw 1 18 | set style line 7 lc rgb '#984EA3' lw 1 19 | set style line 12 lc rgb '#FFC020' lt 4 lw 1 20 | set style line 13 lc rgb '#bbbbbb' lt 1 lw 2 21 | set style line 14 lc rgb '#dddddd' lt 4 lw 1 22 | set grid xtics mxtics ytics mytics back ls 13, ls 14 23 | 24 | 25 | set yrange [0.0:0.625] 26 | set 
xrange [0.0:0.5] 27 | set ytics 0.0,0.2,1 28 | set xtics 0.0,0.1,1 29 | 30 | set mytics 4 31 | set mxtics 4 32 | 33 | set xlabel "Fraction of Freebase training facts" 34 | set ylabel "MAP" 35 | 36 | 37 | mf = "< grep \"mf\" ".fileName.".log" 38 | low_rank_logic = "< grep \"low-rank-logic\" ".fileName.".log" 39 | inference_only = "< grep \"inference-only\" ".fileName.".log" 40 | pre_inference = "< grep \"pre-inference\" ".fileName.".log" 41 | post_inference = "< grep \"post-inference\" ".fileName.".log" 42 | pre_post_inference = "< grep \"pre-post-inference\" ".fileName.".log" 43 | 44 | #plot mf using 3:1 smooth unique with linespoints ls 1 linewidth 3 title "Matrix Factorization",\ 45 | low_rank_logic using 3:1 smooth unique with linespoints ls 2 linewidth 3 title "Joint Optimization",\ 46 | pre_inference using 3:1 smooth unique with linespoints ls 4 linewidth 3 title "Pre-Inference",\ 47 | post_inference using 3:1 smooth unique with linespoints ls 5 linewidth 3 title "Post-Inference",\ 48 | inference_only using 3:1 smooth unique with linespoints ls 3 linewidth 3 title "Logical Inference",\ 49 | 50 | # Wrapup 51 | #set terminal pdf enhanced dashed size 2.75,2 #1.75 #size 400,400 52 | #set terminal png truecolor size 500,500 53 | #set output fileName."-MAP.png" 54 | #set output fileName."-MAP.pdf" 55 | #refresh 56 | #unset output 57 | 58 | #set terminal dumb enhanced 59 | 60 | #set object 1 rect from 0,0.3 to 0.4,0.6 lw 6 fs empty border lc rgb 'gold' 61 | 62 | plot mf using 3:2 smooth unique with linespoints ls 1 linewidth 3 title "MF",\ 63 | low_rank_logic using 3:2 smooth unique with linespoints ls 2 linewidth 3 title "Joint",\ 64 | pre_inference using 3:2 smooth unique with linespoints ls 12 linewidth 3 title "Pre",\ 65 | post_inference using 3:2 smooth unique with linespoints ls 3 linewidth 3 title "Post",\ 66 | inference_only using 3:2 smooth unique with linespoints ls 7 linewidth 3 title "Inf" 67 | 68 | 69 | # Wrapup 70 | set ylabel "wMAP" 71 | set terminal pdf enhanced dashed size 2.75,1.85#,1.85 #1.75 #size 400,400 72 | set output fileName."-wMAP.pdf" 73 | #set terminal png truecolor size 500,500 74 | #set output fileName."-wMAP.png" 75 | refresh 76 | unset output -------------------------------------------------------------------------------- /data/eval/hist.gpl: -------------------------------------------------------------------------------- 1 | set terminal dumb enhanced 2 | #unset key 3 | 4 | binwidth=0.01 5 | bin(x,width)=width*floor(x/width) 6 | 7 | set xlabel "Distribution over length differences of implications" 8 | 9 | 10 | if (!exists("fileName")) fileName='lengths' 11 | 12 | mf_formulae = "< grep \"mf-formulae\" ".fileName.".txt" 13 | mf_sample = "< grep \"mf-sample\" ".fileName.".txt" 14 | joint_formulae = "< grep \"joint-formulae\" ".fileName.".txt" 15 | pre_formulae = "< grep \"pre-formulae\" ".fileName.".txt" 16 | 17 | plot mf_sample using (bin($1,binwidth)):(1.0) smooth freq with boxes title "mf-sample",\ 18 | mf_formulae using (bin($1,binwidth)):(1.0) smooth freq with boxes title "mf-formulae",\ 19 | pre_formulae using (bin($1,binwidth)):(1.0) smooth freq with boxes title "pre-formulae",\ 20 | joint_formulae using (bin($1,binwidth)):(1.0) smooth freq with boxes title "joint-formulae",\ 21 | 22 | 23 | # Wrapup 24 | set terminal pdf enhanced size 2.75,2 #1.75 #size 400,400 25 | #set terminal png truecolor size 500,500 26 | #set output fileName.".png" 27 | set output fileName.".pdf" 28 | refresh 29 | unset output 
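For reference, gnuplot's bin(x,width) above snaps each value to the left edge of its bin; a Scala equivalent of the same formula (illustrative only):

    // Scala equivalent of gnuplot's bin(x,width) = width*floor(x/width)
    def bin(x: Double, width: Double): Double = width * math.floor(x / width)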
-------------------------------------------------------------------------------- /data/eval/results.gpl: -------------------------------------------------------------------------------- 1 | # Chart settings 2 | #set title "Averaged 11-point Precision/Recall" 3 | set key right top 4 | set key width 0.5 5 | set key box 6 | 7 | load 'Set1.plt' 8 | #load 'Spectral.plt' 9 | 10 | 11 | set terminal pdf enhanced linewidth 3.0 size 3.000000,2.00000 12 | set output "results.pdf" 13 | # XYChart settings 14 | set nologscale 15 | 16 | 17 | #set style line 12 lc rgb '#bbbbbb' lt 1 lw 1 18 | #set style line 13 lc rgb '#cccccc' lt 4 lw 1 19 | #set grid xtics mxtics ytics mytics back ls 12, ls 13 20 | 21 | 22 | set yrange [0.0:1] 23 | set xrange [0.0:1] 24 | set ytics 0.2,0.2,1 25 | set xtics 0.0,0.1,1 26 | 27 | #set mytics 2 28 | #set mxtics 2 29 | 30 | 31 | set xr [0.0:1.0] noreverse 32 | set yr [0.0:1.0] noreverse 33 | set xlabel "Recall" 34 | set ylabel "Precision" 35 | 36 | 37 | # XYData Plotting 38 | plot \ 39 | '-' using 1:2 title "Joint" with linespoints ls 2, \ 40 | '-' using 1:2 title "MF" with linespoints ls 1, \ 41 | '-' using 1:2 title "Mintz09" with linespoints ls 12, \ 42 | '-' using 1:2 title "Yao11" with linespoints ls 4, \ 43 | '-' using 1:2 title "Surdeanu12" with linespoints ls 3, \ 44 | '-' using 1:2 title "Riedel13-F" with linespoints ls 9 45 | # Joint 46 | 0.0 1.0 47 | 0.1 0.9561403508771931 48 | 0.2 0.9175637958532697 49 | 0.3 0.8734179657354529 50 | 0.4 0.847352784814085 51 | 0.5 0.8281385859497583 52 | 0.6 0.6280542866809915 53 | 0.7 0.5553527881495134 54 | 0.8 0.4954569942312183 55 | 0.9 0.35796469257369756 56 | 1.0 0.273855414645719 57 | end 58 | # MF 59 | 0.0 1.0 60 | 0.1 0.9236842105263157 61 | 0.2 0.8796321945213911 62 | 0.3 0.8316317108560876 63 | 0.4 0.8000582913269824 64 | 0.5 0.7826510614100983 65 | 0.6 0.5984916612797895 66 | 0.7 0.5377585005461876 67 | 0.8 0.47144152503664516 68 | 0.9 0.33738750278770263 69 | 1.0 0.25211611853430466 70 | end 71 | # Mintz09 72 | 0.0 1.0 73 | 0.1 0.5680898049319102 74 | 0.2 0.49888269625111725 75 | 0.3 0.39253056884635834 76 | 0.4 0.34616228070175437 77 | 0.5 0.330326960694419 78 | 0.6 0.2625455040315721 79 | 0.7 0.2544227428517771 80 | 0.8 0.20727745289148797 81 | 0.9 0.1111017092570655 82 | 1.0 0.10526315789473684 83 | end 84 | # Yao11 85 | 0.0 1.0 86 | 0.1 0.7298830409356726 87 | 0.2 0.6976507005539263 88 | 0.3 0.6054098105574357 89 | 0.4 0.5561356142314963 90 | 0.5 0.5196400928792569 91 | 0.6 0.3274135882806314 92 | 0.7 0.3046111421649812 93 | 0.8 0.2690176216877453 94 | 0.9 0.12848082810905379 95 | 1.0 0.11842105263157894 96 | end 97 | # Surdeanu12 98 | 0.0 1.0 99 | 0.1 0.7976055002370792 100 | 0.2 0.7603247984826932 101 | 0.3 0.7165779313147734 102 | 0.4 0.692342471423568 103 | 0.5 0.6831544034214891 104 | 0.6 0.5072457004580417 105 | 0.7 0.4886786729963325 106 | 0.8 0.4152355565638234 107 | 0.9 0.30327260458839406 108 | 1.0 0.23157894736842108 109 | end 110 | # Riedel13-F 111 | 0.0 1.0 112 | 0.1 0.8756827967354283 113 | 0.2 0.8394421952316689 114 | 0.3 0.7904526759789917 115 | 0.4 0.7538715117662486 116 | 0.5 0.7354230316280177 117 | 0.6 0.5990688237411651 118 | 0.7 0.5388328295170529 119 | 0.8 0.488869738535952 120 | 0.9 0.3873569895215238 121 | 1.0 0.2823195983625368 122 | end 123 | 124 | unset output 125 | # Wrapup 126 | set terminal dumb 127 | refresh 128 | -------------------------------------------------------------------------------- /data/formulae/filtered.txt: 
-------------------------------------------------------------------------------- 1 | //1 0.97 0.96 27 curated 2 | path#nn|<-nn<-unit->prep->of->pobj->|pobj:INV => REL$/organization/parent/child 3 | 4 | //2 0.97 1.00 22 curated 5 | path#appos|->appos->subsidiary->prep->of->pobj->|pobj:INV => REL$/organization/parent/child 6 | 7 | //3 0.97 0.88 17 curated 8 | path#rcmod|->rcmod->own->prep->by->pobj->|pobj:INV => REL$/organization/parent/child 9 | 10 | //4 0.97 1.00 26 curated 11 | path#nn|<-nn<-city->prep->of->pobj->|pobj:INV => REL$/location/location/containedby 12 | 13 | //5 0.97 1.00 11 curated 14 | path#appos|->appos->subsidiary->nn->|nn:INV => REL$/organization/parent/child 15 | 16 | //6 0.97 0.97 100 curated 17 | path#poss|<-poss<-minister->appos->|appos:INV => REL$/people/person/nationality 18 | 19 | //7 0.97 0.96 50 curated 20 | path#appos|->appos->unit->prep->of->pobj->|pobj:INV => REL$/organization/parent/child 21 | 22 | //8 0.96 0.96 25 curated 23 | path#appos|->appos->division->prep->of->pobj->|pobj:INV => REL$/organization/parent/child 24 | 25 | //9 0.96 1.00 36 curated 26 | path#poss|<-poss<-executive->appos->|appos:INV => REL$/business/person/company 27 | 28 | //10 0.96 1.00 11 curated 29 | path#appos|->appos->co-founder->prep->of->pobj->|pobj:INV => REL$/business/company/founders 30 | 31 | //11 0.96 0.92 59 curated 32 | path#dobj|<-dobj<-review->prep->by->pobj->|pobj:INV => REL$/book/author/works_written 33 | 34 | //12 0.95 0.85 27 curated 35 | path#appos|->appos->founder->prep->of->pobj->|pobj:INV => REL$/business/company/founders 36 | 37 | //13 0.95 0.94 89 curated 38 | path#nn|<-nn<-town->prep->of->pobj->|pobj:INV => REL$/location/location/containedby 39 | 40 | //14 0.95 0.53 40 curated 41 | path#nn|<-nn<-neighborhood->prep->of->pobj->|pobj:INV => REL$/location/neighborhood/neighborhood_of 42 | 43 | //15 0.95 0.82 28 curated 44 | path#appos|->appos->director->dep->|dep:INV => REL$/film/film/directed_by 45 | 46 | //16 0.95 0.92 13 curated 47 | path#poss|<-poss<-region->nn->|nn:INV => REL$/location/location/containedby 48 | 49 | //17 0.94 0.92 13 curated 50 | path#appos|->appos->producer->dep->|dep:INV => REL$/film/film/produced_by 51 | 52 | //18 0.94 0.62 47 curated 53 | path#poss|<-poss<-film->dep->|dep:INV => REL$/film/film/directed_by 54 | 55 | //19 0.94 0.97 29 curated 56 | path#nsubj|<-nsubj<-professor->prep->at->pobj->|pobj:INV => REL$/location/location/containedby 57 | 58 | //20 0.94 0.58 24 curated 59 | path#poss|<-poss<-movie->dep->|dep:INV => REL$/film/film/directed_by 60 | 61 | //21 0.93 0.80 15 curated 62 | path#poss|<-poss<-leader->appos->|appos:INV => REL$/people/person/nationality 63 | 64 | //22 0.93 0.63 16 curated 65 | path#nn|<-nn<-film->dep->|dep:INV => REL$/film/film/directed_by 66 | 67 | //23 0.93 0.85 20 curated 68 | path#nn|<-nn<-suburb->prep->of->pobj->|pobj:INV => REL$/location/location/containedby 69 | 70 | //24 0.93 0.67 39 curated 71 | path#appos|->appos->daughter->prep->of->pobj->|pobj => REL$/people/person/parents 72 | 73 | //25 0.93 0.87 15 curated 74 | path#poss|<-poss<-chairman->appos->|appos:INV => REL$/business/person/company 75 | 76 | //26 0.93 0.92 12 curated 77 | path#nn|<-nn<-side->prep->of->pobj->|pobj:INV => REL$/location/location/containedby 78 | 79 | //27 0.93 0.64 216 curated 80 | path#nsubj|<-nsubj<-die->prep->in->pobj->|pobj => REL$/people/deceased_person/place_of_death 81 | 82 | //28 0.93 0.53 40 curated 83 | path#poss|<-poss<-neighborhood->nn->|nn:INV => REL$/location/neighborhood/neighborhood_of 84 | 85 | //29 0.91 0.93 15 
curated 86 | path#nsubj|<-nsubj<-professor->appos->|appos:INV => REL$/location/location/containedby 87 | 88 | //30 0.91 0.76 33 curated 89 | path#nsubj|<-nsubj<-die->prep->at->pobj->hospital->prep->in->pobj->|pobj => REL$/people/deceased_person/place_of_death 90 | 91 | //31 0.91 0.47 45 curated 92 | path#poss|<-poss<-book->dep->|dep => REL$/book/author/works_written 93 | 94 | //32 0.90 0.87 15 curated 95 | path#nsubj|<-nsubj<-name->dobj->|dobj:INV => REL$/business/person/company 96 | 97 | //33 0.90 0.51 241 curated 98 | path#nsubjpass|<-nsubjpass<-bear->prep->in->pobj->|pobj => REL$/people/person/place_of_birth 99 | 100 | //34 0.90 0.48 153 curated 101 | path#appos|->appos->minister->poss->|poss => REL$/people/person/nationality 102 | 103 | //35 0.88 0.70 83 curated 104 | path#appos|->appos->capital->prep->of->pobj->|pobj => REL$/location/location/containedby 105 | 106 | //36 0.87 0.68 19 curated 107 | path#nsubj|<-nsubj<-city->prep->in->pobj->|pobj => REL$/location/location/containedby -------------------------------------------------------------------------------- /data/naacl2013/structured/eval-naacl13-structured.out.txt: -------------------------------------------------------------------------------- 1 | Reading in annotations... 2 | Collecting facts from rank files 3 | Loading Annotations 4 | Loading Rank Files 5 | Latex: 6 | \begin{center} 7 | \begin{tabular}{ l l | c c c c c c c } 8 | Relation & \# & MI09 & YA11 & SU12 & N & F & NF & NFE \\ 9 | \hline 10 | person/company & 103 & 0.67 & 0.64 & 0.70 & 0.73 & 0.75 & 0.76 & {\bf 0.79} \\ 11 | location/containedby & 74 & 0.48 & 0.51 & 0.54 & 0.43 & 0.68 & 0.67 & {\bf 0.69} \\ 12 | person/nationality & 29 & 0.13 & {\bf 0.39} & 0.12 & 0.14 & 0.18 & 0.18 & 0.21 \\ 13 | author/works\_written & 29 & 0.50 & 0.51 & 0.52 & 0.45 & 0.61 & 0.63 & {\bf 0.69} \\ 14 | parent/child & 19 & 0.14 & 0.25 & 0.62 & 0.46 & 0.76 & {\bf 0.78} & 0.76 \\ 15 | person/place\_of\_death & 19 & 0.79 & 0.79 & 0.86 & {\bf 0.89} & 0.83 & 0.85 & 0.86 \\ 16 | person/place\_of\_birth & 18 & 0.78 & 0.75 & 0.82 & 0.50 & 0.83 & 0.81 & {\bf 0.89} \\ 17 | neighborhood/neighborhood\_of & 12 & 0.00 & 0.00 & 0.08 & 0.43 & 0.65 & 0.66 & {\bf 0.72} \\ 18 | person/parents & 7 & 0.24 & 0.27 & 0.58 & 0.56 & 0.53 & {\bf 0.58} & 0.39 \\ 19 | company/founders & 4 & 0.25 & 0.25 & 0.53 & 0.24 & 0.77 & {\bf 0.80} & 0.68 \\ 20 | film/directed\_by & 4 & 0.06 & 0.15 & 0.25 & 0.09 & 0.26 & 0.26 & {\bf 0.30} \\ 21 | sports\_team/league & 4 & 0.00 & 0.43 & 0.18 & 0.21 & 0.59 & {\bf 0.70} & 0.63 \\ 22 | team/arena\_stadium & 3 & 0.00 & 0.06 & 0.06 & 0.03 & 0.08 & {\bf 0.09} & 0.08 \\ 23 | team\_owner/teams\_owned & 2 & 0.00 & 0.50 & 0.70 & 0.55 & 0.38 & 0.61 & {\bf 0.75} \\ 24 | roadcast/area\_served & 2 & {\em 1.00} & 0.50 & {\em 1.00} & 0.58 & 0.58 & 0.83 & {\em 1.00} \\ 25 | structure/architect & 2 & 0.00 & 0.00 & {\em 1.00} & 0.27 & {\em 1.00} & {\em 1.00} & {\em 1.00} \\ 26 | composer/compositions & 2 & 0.00 & 0.00 & 0.00 & 0.50 & 0.67 & {\bf 0.83} & 0.12 \\ 27 | person/religion & 1 & 0.00 & {\em 1.00} & {\em 1.00} & 0.50 & {\em 1.00} & {\em 1.00} & {\em 1.00} \\ 28 | film/produced\_by & 1 & {\em 1.00} & {\em 1.00} & {\em 1.00} & {\em 1.00} & 0.50 & 0.50 & 0.33 \\ 29 | \hline 30 | MAP & & 0.32 & 0.42 & 0.56 & 0.45 & 0.61 & 0.66 & 0.63 \\ 31 | Weighted MAP & & 0.48 & 0.52 & 0.57 & 0.52 & 0.66 & 0.67 & 0.69 \\ 32 | \end{tabular} 33 | \end{center} 34 | Summary: 35 | Pattern Gold+ Gold+- | MAP Missing | MAP Missing | MAP Missing | MAP Missing | MAP Missing | MAP Missing | MAP Missing 36 | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 37 | person/company$ 103 166 | 0.67 0 | 0.64 0 | 0.70 0 | 0.73 0 | 0.75 0 | 0.76 0 | 0.79 0 38 | location/containedby$ 74 235 | 0.48 0 | 0.51 0 | 0.54 0 | 0.43 0 | 0.68 0 | 0.67 0 | 0.69 0 39 | person/nationality$ 29 278 | 0.13 0 | 0.39 0 | 0.12 0 | 0.14 0 | 0.18 0 | 0.18 0 | 0.21 0 40 | author/works_written$ 29 280 | 0.50 0 | 0.51 0 | 0.52 0 | 0.45 0 | 0.61 0 | 0.63 0 | 0.69 0 41 | parent/child$ 19 256 | 0.14 0 | 0.25 0 | 0.62 0 | 0.46 0 | 0.76 0 | 0.78 0 | 0.76 0 42 | person/place_of_death$ 19 221 | 0.79 0 | 0.79 0 | 0.86 0 | 0.89 0 | 0.83 0 | 0.85 0 | 0.86 0 43 | person/place_of_birth$ 18 216 | 0.78 0 | 0.75 0 | 0.82 0 | 0.50 0 | 0.83 0 | 0.81 0 | 0.89 0 44 | neighborhood/neighborhood_of 12 245 | 0.00 -1 | 0.00 -1 | 0.08 0 | 0.43 0 | 0.65 0 | 0.66 0 | 0.72 0 45 | person/parents$ 7 279 | 0.24 0 | 0.27 0 | 0.58 0 | 0.56 0 | 0.53 0 | 0.58 0 | 0.39 0 46 | company/founders$ 4 294 | 0.25 0 | 0.25 0 | 0.53 0 | 0.24 0 | 0.77 0 | 0.80 0 | 0.68 0 47 | film/directed_by 4 305 | 0.06 0 | 0.15 0 | 0.25 0 | 0.09 0 | 0.26 0 | 0.26 0 | 0.30 0 48 | sports_team/league$ 4 293 | 0.00 -1 | 0.43 0 | 0.18 0 | 0.21 0 | 0.59 0 | 0.70 0 | 0.63 0 49 | team/arena_stadium$ 3 220 | 0.00 0 | 0.06 0 | 0.06 0 | 0.03 0 | 0.08 0 | 0.09 0 | 0.08 0 50 | team_owner/teams_owned$ 2 229 | 0.00 0 | 0.50 0 | 0.70 0 | 0.55 0 | 0.38 0 | 0.61 0 | 0.75 0 51 | roadcast/area_served$ 2 297 | 1.00 0 | 0.50 0 | 1.00 0 | 0.58 0 | 0.58 0 | 0.83 0 | 1.00 0 52 | structure/architect$ 2 286 | 0.00 -1 | 0.00 -1 | 1.00 0 | 0.27 0 | 1.00 0 | 1.00 0 | 1.00 0 53 | composer/compositions$ 2 297 | 0.00 -1 | 0.00 -1 | 0.00 0 | 0.50 0 | 0.67 0 | 0.83 0 | 0.12 0 54 | person/religion$ 1 271 | 0.00 -1 | 1.00 0 | 1.00 0 | 0.50 0 | 1.00 0 | 1.00 0 | 1.00 0 55 | film/produced_by$ 1 291 | 1.00 0 | 1.00 0 | 1.00 0 | 1.00 0 | 0.50 0 | 0.50 0 | 0.33 0 56 | Average 0 0 | 0.32 -1 | 0.42 -1 | 0.56 -1 | 0.45 -1 | 0.61 -1 | 0.66 -1 | 0.63 -1 57 | Global 0 0 | 0.48 -1 | 0.52 -1 | 0.57 -1 | 0.52 -1 | 0.66 -1 | 0.67 -1 | 0.69 -1 58 | name MI09 YA11 SU12 N F NF NFE 59 | MI09 3/10 0.092 1/15 0.001 5/13 0.096 2/17 0.001 2/17 0.001 1/17 0.000 60 | MI09 3/10 0.092 1/15 0.001 5/13 0.096 2/17 0.001 2/17 0.001 1/17 0.000 61 | MI09 3/10 0.092 1/15 0.001 5/13 0.096 2/17 0.001 2/17 0.001 1/17 0.000 62 | MI09 3/10 0.092 1/15 0.001 5/13 0.096 2/17 0.001 2/17 0.001 1/17 0.000 63 | MI09 3/10 0.092 1/15 0.001 5/13 0.096 2/17 0.001 2/17 0.001 1/17 0.000 64 | MI09 3/10 0.092 1/15 0.001 5/13 0.096 2/17 0.001 2/17 0.001 1/17 0.000 65 | YA11 2/13 0.007 9/ 9 1.185 3/15 0.008 2/16 0.001 2/16 0.001 66 | YA11 2/13 0.007 9/ 9 1.185 3/15 0.008 2/16 0.001 2/16 0.001 67 | YA11 2/13 0.007 9/ 9 1.185 3/15 0.008 2/16 0.001 2/16 0.001 68 | YA11 2/13 0.007 9/ 9 1.185 3/15 0.008 2/16 0.001 2/16 0.001 69 | YA11 2/13 0.007 9/ 9 1.185 3/15 0.008 2/16 0.001 2/16 0.001 70 | SU12 12/ 6 0.238 5/12 0.143 5/12 0.143 2/14 0.004 71 | SU12 12/ 6 0.238 5/12 0.143 5/12 0.143 2/14 0.004 72 | SU12 12/ 6 0.238 5/12 0.143 5/12 0.143 2/14 0.004 73 | SU12 12/ 6 0.238 5/12 0.143 5/12 0.143 2/14 0.004 74 | N 4/14 0.031 2/17 0.001 4/15 0.019 75 | N 4/14 0.031 2/17 0.001 4/15 0.019 76 | N 4/14 0.031 2/17 0.001 4/15 0.019 77 | F 3/13 0.021 4/12 0.077 78 | F 3/13 0.021 4/12 0.077 79 | NF 7/10 0.629 80 | -------------------------------------------------------------------------------- /overview.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/low-rank-logic/2c5686eda9e0c0c389ede5c6e4eea885d14e947c/overview.png -------------------------------------------------------------------------------- /project/Build.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import Keys._ 3 | 4 | object Build extends Build { 5 | val Organization = "uclmr" 6 | val Name = "low-rank-logic" 7 | val Version = "0.1.0-SNAPSHOT" 8 | val ScalaVersion = "2.10.4" 9 | 10 | 11 | lazy val wolfeCore = ProjectRef(file("./wolfe"), "wolfe-core") 12 | lazy val wolfeUtil = ProjectRef(file("./wolfe"), "wolfe-util") 13 | 14 | lazy val root = Project( 15 | "low-rank-logic", 16 | file("."), 17 | settings = Defaults.defaultSettings ++ Seq( 18 | organization := Organization, 19 | name := Name, 20 | version := Version, 21 | scalaVersion := ScalaVersion, 22 | libraryDependencies ++= Seq( 23 | "net.sandrogrzicic" %% "scalabuff-compiler" % "1.3.6", 24 | "net.sandrogrzicic" %% "scalabuff-runtime" % "1.3.6", 25 | "com.google.protobuf" % "protobuf-java" % "2.3.0" 26 | ), 27 | commands ++= Seq(vmargs), 28 | fork in run := true //use a fresh JVM for sbt run 29 | ) 30 | ) dependsOn ( 31 | wolfeCore % "test->test;compile->compile", 32 | wolfeUtil % "test->test;compile->compile"//, 33 | ) 34 | 35 | //utility methods 36 | def vmargs = Command.args("vmargs", "") { 37 | (state, args) => 38 | val javaRunOptions = args.mkString(" ") 39 | println("Applying JVM arguments: " + javaRunOptions) 40 | Project.extract(state).append(javaOptions := Seq(javaRunOptions), state) 41 | } 42 | } -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.mojolly.scalate" % "xsbt-scalate-generator" % "0.5.0") 2 | 3 | addSbtPlugin("org.scalatra.sbt" % "scalatra-sbt" % "0.3.5") -------------------------------------------------------------------------------- /src/main/scala/uclmr/AnnotationTool.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import java.io.{FileInputStream, File, PrintStream, InputStream} 4 | import java.text.SimpleDateFormat 5 | import java.util.Calendar 6 | 7 | import scala.collection.mutable 8 | import scala.collection.mutable.{HashSet, HashMap} 9 | import scala.io.Source 10 | 11 | /** 12 | * @author Sebastian Riedel 13 | */ 14 | object AnnotationTool { 15 | 16 | import ml.wolfe.util.ANSIFormatter.ANSIString 17 | 18 | case class Annotation(tuple: Seq[Any], label: String, correct: Boolean) { 19 | override def toString = (Seq(if (correct) "1" else "0", label) ++ tuple).mkString("\t") 20 | 21 | def fact = tuple -> label 22 | } 23 | 24 | def loadAnnotations(in: InputStream, out: Option[PrintStream] = None) = { 25 | println("Reading in annotations...") 26 | val result = new mutable.HashMap[(Seq[Any], String), Annotation]() 27 | for (line <- Source.fromInputStream(in).getLines()) { 28 | val fields = line.split("\\t") 29 | val correct = fields(0) == "1" 30 | val label = fields(1) 31 | val tuple = fields.drop(2).toSeq 32 | result(Tuple2(tuple,label)) = Annotation(tuple, label, correct) 33 | for (o <- out) o.println(line) 34 | } 35 | result 36 | } 37 | 38 | def loadMentions(mentionFileName: String) = { 39 | val pair2sen = new HashMap[Seq[Any], HashSet[String]] // arg1 -> rel arg1 arg2 40 | val source = 
Source.fromFile(mentionFileName,"ISO-8859-1") 41 | println("Loading mention file...") 42 | for (line <- source.getLines(); if (!line.startsWith("#Document"))) { 43 | val fields = line.split("\t") 44 | val sen = fields(fields.length - 1) 45 | val sens = pair2sen.getOrElseUpdate(Seq(fields(1), fields(2)), new HashSet[String]) 46 | sens += sen 47 | } 48 | source.close() 49 | pair2sen 50 | } 51 | 52 | 53 | def main(args: Array[String]) { 54 | val sourceName = args(0) 55 | val projDirName = args(1) 56 | val mentionFileName = args(2) 57 | //val pattern = args.lift(3).getOrElse("").r 58 | val pattern = if (args.length > 3) args(3) else ".*" 59 | val previousFileName = args.lift(4).getOrElse("latest.tsv") 60 | println(previousFileName) 61 | val newFileName = args.lift(5).getOrElse({ 62 | val cal = Calendar.getInstance() 63 | val sdf = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss") 64 | sdf.format(cal.getTime) + ".tsv" 65 | }) 66 | val projDir = new File(projDirName) 67 | projDir.mkdirs() 68 | 69 | val sourceFile = new File(sourceName) 70 | val previousFile = new File(projDir, previousFileName) 71 | val newFile = new File(projDir, newFileName) 72 | val out = new PrintStream(newFile) 73 | 74 | //read in mention file 75 | 76 | val pair2sen = loadMentions(mentionFileName) 77 | 78 | //read in previous file if exists 79 | //Format: Tuple, System, 80 | val annotations = if (previousFile.exists()) 81 | loadAnnotations(new FileInputStream(previousFile), Some(out)) 82 | else 83 | new mutable.HashMap[(Seq[Any], String), Annotation] 84 | println("Previous Annotations: " + annotations.size) 85 | 86 | //set up new softlink 87 | setupSoftlink(new File(projDir, "latest.tsv"), newFile) 88 | 89 | var labelled = 0 90 | 91 | //go through ranked file, and find tuples not yet annotated 92 | for (line <- Source.fromFile(sourceFile).getLines()) { 93 | val lineTmp = line.split("\\|").mkString("\t") 94 | val Array(score, arg1, arg2, freebase, predicted) = lineTmp.split("\\t") 95 | //if (pattern.contains(predicted)) { 96 | //if (predicted.contains(pattern)) { 97 | if (predicted matches pattern) { 98 | val tuple = Seq(arg1, arg2) 99 | annotations.get(Tuple2(tuple, predicted)) match { 100 | case None => 101 | //get sentences 102 | val sentences = pair2sen.getOrElse(tuple, Set.empty) 103 | //ask user 104 | println("*************************************************") 105 | println("Asking for annotation of: " + tuple.mkString(" | ")) 106 | println("Number of annotations: " + labelled) 107 | println("Prediction: " + predicted) 108 | println("Score: " + score) 109 | println("Freebase: " + freebase) 110 | println("Sentences: ") 111 | for (sentence <- sentences) { 112 | var current:String = sentence 113 | var first = true 114 | for (arg <- tuple) { 115 | def render = if (first) arg.toUpperCase.onBlue() else arg.toUpperCase.onRed() 116 | if (current.contains(arg)) { 117 | current = current.replaceAll(arg, if (first) arg.onBlue() else arg.onRed()) 118 | } else if (current.contains(arg.toUpperCase)) { 119 | current = current.replaceAll(arg.toUpperCase, render) 120 | } else if (current.contains(arg.toLowerCase)) { 121 | current = current.replaceAll(arg.toLowerCase, render) 122 | } 123 | first = false 124 | } 125 | println(" " + current) 126 | } 127 | println("Correct (y/N)?: ") 128 | val line = readLine() 129 | val correct = line.trim.toLowerCase == "y" 130 | val annotation = Annotation(tuple, predicted, correct) 131 | out.println(annotation) 132 | out.flush() 133 | 134 | case Some(annotation) => //println(annotation) 135 | } 136 | 
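// running count shown as "Number of annotations" in the prompt above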
labelled += 1 137 | } 138 | 139 | } 140 | 141 | 142 | } 143 | 144 | def setupSoftlink(latest: File, newFile: File) { 145 | if (latest.exists()) { 146 | //remove latest file, assuming it's a softlink 147 | latest.delete() 148 | } 149 | import scala.sys.process._ 150 | 151 | ("/bin/ln -s " + newFile.getAbsolutePath + " " + latest.getAbsolutePath).!! 152 | 153 | //Runtime.getRuntime.exec("/bin/ln -s %s %s".format(newFile.getAbsolutePath, latest.getAbsolutePath)) 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/EmbeddedProbLogicEvaluation.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import java.io.File 4 | 5 | import cc.factorie.la.DenseTensor1 6 | import com.typesafe.config.ConfigFactory 7 | import ml.wolfe.Wolfe._ 8 | import uclmr.FactorizationUtil.Row 9 | 10 | import scala.collection.mutable 11 | import scala.util.Random 12 | 13 | /** 14 | * @author Sebastian Riedel 15 | */ 16 | object EmbeddedProbLogicEvaluation { 17 | 18 | 19 | def main(args: Array[String]) { 20 | implicit val conf = ConfigFactory.parseFile(new File("conf/epl.conf")) 21 | implicit val random = new Random(0) 22 | assert(!conf.entrySet().isEmpty, "Couldn't find configuration file.") 23 | 24 | def relationFilter(rel: String) = rel.startsWith("path") || (rel.startsWith("REL$") && rel != "REL$NA") 25 | 26 | //load raw data 27 | val trainRaw = FactorizationUtil.loadLiminFile(new File(conf.getString("epl.train")), relationFilter) 28 | val train = FactorizationUtil.filterRows(random.shuffle(trainRaw.toBuffer), conf.getInt("epl.min-rows"), conf.getInt("epl.min-cols")) 29 | 30 | val unlabeledRaw = FactorizationUtil.loadLiminFile(new File(conf.getString("epl.unlabeled")), relationFilter, skipUnlabeled = true) 31 | val unlabeled = FactorizationUtil.filterRows(unlabeledRaw.toSeq, conf.getInt("epl.min-rows"), conf.getInt("epl.min-cols"), !_.startsWith("REL$")) 32 | val combined = if (conf.getBoolean("epl.use-unlabeled")) train ++ unlabeled else train 33 | 34 | 35 | //relations 36 | val trainRelations = combined.flatMap(_.relations.map(_._1)).distinct.sorted // REL$/book/book_edition/author_editor 37 | val freebaseRelations = trainRelations.filter(_.startsWith("REL$")) //Seq("REL$/business/person/company")// 38 | val surfacePatterns = trainRelations.filterNot(_.startsWith("REL$")).toSet 39 | 40 | val testRaw = FactorizationUtil.loadLiminFile(new File(conf.getString("epl.test")), relationFilter, 41 | skipUnlabeled = true, minObsCount = 1).toSeq 42 | val test = FactorizationUtil.filterRows(testRaw, 1, 1, surfacePatterns) 43 | 44 | println(trainRelations.size) 45 | println(train.size) 46 | println(unlabeled.size) 47 | println(test.size) 48 | 49 | val priorRepulsion = conf.getDouble("epl.prior-repulsion") 50 | val priorCounts = Map((true, false) -> priorRepulsion, (false, true) -> priorRepulsion) withDefaultValue 0.0 51 | 52 | println("Extracting Binary rules") 53 | val trainRulesRaw = 54 | if (conf.getBoolean("epl.combine-datasets")) RuleLearner.learn(combined, priorCounts) 55 | else RuleLearner.learn(train, priorCounts) + RuleLearner.learn(unlabeled, priorCounts) 56 | 57 | val cooccurMin = conf.getDouble("epl.min-cooccur") 58 | 59 | println("Finding components") 60 | val connected = RuleFilters.connectedComponents(trainRulesRaw, cooccurMin) 61 | val connectedFreebase = connected.filter(c => freebaseRelations.exists(c._1.nodes)) 62 | println(s"Connected Components: ${ connected.size }") 63 | 
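// components without any REL$ relation are dropped above, since they cannot yield Freebase predictions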
println(s"Connected Components with freebase relations: ${ connectedFreebase.size }") 64 | println(s"Total count of rules in components: ${ connectedFreebase.view.map(_._2.rules2.size).sum }") 65 | FactorizationUtil.saveToFile( 66 | connected.map(_._1.nodes.toList.sorted.mkString("------\n ", "\n ", "")), 67 | new File("/tmp/components.txt")) 68 | 69 | 70 | val trainRulesBeforeSubsampling = Rules(connectedFreebase.map(_._2.rules2).reduce(_ ++ _)) //RuleFilters.keep2ndOrder(joinedRulesRaw, cooccurMin) 71 | val subsample = conf.getDouble("epl.subsample") 72 | val trainRules = Rules(trainRulesBeforeSubsampling.rules2.filter(p => p._2.cooccurCount >= 1 || random.nextDouble() < subsample)) 73 | //val trainRulesFiltered = trainRules.copy(rules2 = trainRules.) 74 | 75 | println(s"Original rule count: ${ trainRulesRaw.rules2.size }") 76 | println(s"Filtered rule count: ${ trainRules.rules2.size }") 77 | 78 | 79 | FactorizationUtil.saveToFile(trainRules.rules1.values.toSeq.sortBy(_.rel).mkString("\n"), new File("/tmp/rules1.txt")) 80 | FactorizationUtil.saveToFile(trainRules.rules2.values.toSeq.sortBy(-_.probs(true, true)), new File("/tmp/rules2.txt")) 81 | 82 | 83 | println(s"Embedding ${ trainRules.rules2.size } rules") 84 | val ple = ProbLogicEmbedder.embed(trainRules) 85 | 86 | println("Prediction") 87 | val predictedRows = test map (row => ple.predictRow(row, freebaseRelations)) 88 | val predictedFacts = FactorizationUtil.toRankedFacts(test zip predictedRows).filter(_.score > 0.0) 89 | 90 | println(predictedFacts.take(100).mkString("\n")) 91 | FactorizationUtil.saveForUSchemaEval(predictedFacts, new File("/tmp/ple.txt")) 92 | FactorizationUtil.saveToFile(predictedFacts.mkString("\n"), new File("/tmp/ranked.txt")) 93 | 94 | if (conf.getBoolean("epl.print-comparisons")) { 95 | println("Extracting learned rules") 96 | val learnedRules = ple.pairwiseRules(trainRules.rules2.keys) 97 | compareRules(learnedRules, trainRules.rules2) 98 | } 99 | 100 | } 101 | 102 | def compareRules(rules2: Map[(String, String), Rule2], rules1: Map[(String, String), Rule2]) = { 103 | val paired = for (r1 <- rules1.values; r2 <- rules2.get(r1.rel1 -> r1.rel2)) yield (r1, r2, r2.prob1given2Inc(r1)) 104 | val printPaired = paired.toSeq.sortBy(-_._3).view.map(t => s"Mismatch: ${ t._3 }\n${ t._1 }\n${ t._2 }") 105 | val printPairedInv = paired.toSeq.sortBy(_._3).view.map(t => s"Mismatch: ${ t._3 }\n${ t._1 }\n${ t._2 }") 106 | FactorizationUtil.saveToFile(printPaired, new File("/tmp/rule-comparisons.txt")) 107 | FactorizationUtil.saveToFile(printPairedInv, new File("/tmp/rule-comparisons-inv.txt")) 108 | 109 | val avgCondMismatch = paired.view.map(t => math.abs(t._3)).sum / paired.size 110 | println("Average cond. 
mismatch: " + avgCondMismatch) 111 | 112 | } 113 | 114 | } 115 | 116 | object EmbeddedProbLogicPlayground { 117 | 118 | import scala.math._ 119 | 120 | def manualRules(): Unit = { 121 | implicit val conf = ConfigFactory.parseFile(new File("conf/epl-synth.conf")) withFallback ConfigFactory.parseFile(new File("conf/epl.conf")) 122 | implicit val random = new Random(0) 123 | 124 | 125 | val test = FactorizationUtil.sampleRows(10, 10, 0.2) 126 | val manualData = Seq( 127 | Row("e1", "e2", Seq("r1", "r2").map(_ -> 1.0)), 128 | Row("e3", "e4", Seq("r2", "r3").map(_ -> 1.0)), 129 | Row("e5", "e6", Seq("r4", "r5").map(_ -> 1.0)) 130 | ) 131 | 132 | val dataRelations = test.flatMap(_.observedTrue).distinct.sorted 133 | 134 | 135 | val manualEmbeddings = ProbLogicEmbeddings(Map( 136 | "r0" -> PredicateEmbedding("r0", new DenseTensor1(Array(1.0, 0.0)), 1.0, -2.0, 10.0), 137 | "r1" -> PredicateEmbedding("r1", new DenseTensor1(Array(1.0 / sqrt(2), 1.0 / sqrt(2))), 1.0, 0.0, 1.0) 138 | )) 139 | val ple = manualEmbeddings //ProbLogicEmbedder.embed(manualRules).copy(average = false) //ProbLogicEmbedder.embed(randomRules) 140 | 141 | val rulesData = RuleLearner.learn(manualData) 142 | 143 | val pleData = ProbLogicEmbedder.embed(rulesData) 144 | val learnedRules = pleData.pairwiseRules(rulesData.rules2.keys) 145 | 146 | EmbeddedProbLogicEvaluation.compareRules(learnedRules, rulesData.rules2) 147 | 148 | val predictionsData = for (row <- test) yield { 149 | row.copy(relations = dataRelations.map(r => r -> pleData.predict(row.observedTrue, r))) 150 | } 151 | 152 | println(FactorizationUtil.renderPredictions(predictionsData, test)) 153 | println(pleData.embeddings.values.mkString("\n")) 154 | 155 | } 156 | 157 | def main(args: Array[String]) { 158 | manualRules() 159 | } 160 | 161 | def rulesFromRandomData() { 162 | implicit val conf = ConfigFactory.parseFile(new File("conf/epl-synth.conf")) withFallback ConfigFactory.parseFile(new File("conf/epl.conf")) 163 | implicit val random = new Random(0) 164 | 165 | val randomRows = FactorizationUtil.sampleRows(10, 4, 0.2) 166 | val randomRelations = randomRows.flatMap(_.relations.map(_._1)).distinct.sorted 167 | val randomRules = RuleLearner.learn(randomRows) 168 | 169 | val ple = ProbLogicEmbedder.embed(randomRules) 170 | 171 | val predictions = for (row <- randomRows) yield { 172 | row.copy(relations = randomRelations.map(r => r -> ple.predict(row.observedTrue, r))) 173 | } 174 | 175 | println(randomRules) 176 | 177 | println(FactorizationUtil.renderPredictions(predictions, randomRows)) 178 | 179 | } 180 | } 181 | 182 | 183 | object RuleFilters { 184 | 185 | import scala.math._ 186 | 187 | def keep2ndOrder(rules: Rules, 188 | minCooccurCount: Double) = { 189 | val filtered = rules.rules2.filter(_._2.cooccurCount >= minCooccurCount - 0.0001).map(p => p._1 -> p._2.count * p._2.probs(true, true)) 190 | val graph = filtered ++ filtered.map(p => p.copy(_1 = p._1.swap)) withDefaultValue 0.0 191 | val arg1ToEdges = filtered.toList.groupBy(_._1._1) withDefaultValue Nil 192 | val arg2ToEdges = filtered.toList.groupBy(_._1._2) withDefaultValue Nil 193 | 194 | def expand(graph: Map[(String, String), Double]) = { 195 | //go over all edges (e1,e2) and connect e1 to e3 for each (e2,e3) 196 | //todo: this doesn't find the highest scoring path though 197 | val newEdges = for (((arg1, arg2), s1) <- graph; 198 | ((_, arg3), s2) <- arg1ToEdges(arg2) 199 | if arg3 != arg1 && !graph.contains((arg1, arg3))) yield (arg1, arg3) -> min(s1, s2) 200 | graph ++ newEdges 201 | } 202 | val 
expanded = expand(graph)
203 | rules.copy(rules2 = rules.rules2.filterKeys(expanded.contains))
204 | }
205 |
206 | class Component(first: String) {
207 | val edges = new mutable.HashSet[(String, String)]
208 | val nodes = new mutable.HashSet[String]
209 | nodes += first
210 | }
211 |
212 | def connectedComponents(rules: Rules, minCooccurCount: Double, filter: String => Boolean = _ => true) = {
213 | val filtered = rules.rules2.filter(_._2.cooccurCount >= minCooccurCount - 0.0001).map(p => p._1 -> p._2.count * p._2.probs(true, true))
214 | val graph = filtered ++ filtered.map(p => p.copy(_1 = p._1.swap)) withDefaultValue 0.0
215 |
216 | val components = new mutable.HashMap[String, Component]()
217 | for ((a1, a2) <- graph.keys) {
218 | val c1 = components.getOrElseUpdate(a1, new Component(a1))
219 | val c2 = components.getOrElseUpdate(a2, new Component(a2))
220 | if (c1 == c2) {
221 | c1.edges += ((a1, a2))
222 | } else {
223 | val (keep, discard) = if (c1.nodes.size > c2.nodes.size) (c1, c2) else (c2, c1)
224 | keep.edges += ((a1, a2))
225 | keep.edges ++= discard.edges
226 | keep.nodes ++= discard.nodes
227 | for (n <- discard.nodes) components(n) = keep
228 | }
229 | }
230 | val filteredComponents = components.values.toList.distinct.view.filter(c => c.nodes.exists(filter))
231 | filteredComponents.map(c => c -> rules.copy(rules2 = rules.rules2.filterKeys(e => c.nodes(e._1) && c.nodes(e._2)))).toList
232 |
233 | }
234 |
235 |
236 | }
237 |
238 |
239 |
240 |
241 |
-------------------------------------------------------------------------------- /src/main/scala/uclmr/EntityAwareEvaluation.scala: --------------------------------------------------------------------------------
1 | package uclmr
2 |
3 | import java.io.File
4 |
5 | import com.typesafe.config.ConfigFactory
6 | import ml.wolfe.Wolfe._
7 | import uclmr.FactorizationUtil.Row
8 |
9 | import scala.collection.mutable
10 | import scala.util.Random
11 |
12 | /**
13 | * @author Sebastian Riedel
14 | */
15 | object EntityAwareEvaluation {
16 |
17 | case class Entity(entity: Any, counts: Map[String, Double]) {
18 | override def toString = {
19 | val sorted = counts.toSeq.sortBy(-_._2).map(p => p._1 + " " + p._2)
20 | s"""
21 | |-----
22 | |$entity
23 | |${ sorted.mkString(" ", "\n ", "") }
24 | """.stripMargin
25 | }
26 |
27 | def asArg1 = counts.keys.map(_ + "#1")
28 | def asArg2 = counts.keys.map(_ + "#2")
29 |
30 |
31 | }
32 |
33 | def unaryToBinary(unary: String) = unary.substring(3, unary.length - 2)
34 |
35 | def entitiesFromRows(rows: Seq[Row]) = {
36 | val result = new mutable.HashMap[Any, mutable.HashMap[String, Double]]
37 | for (row <- rows) {
38 | val arg1Counts = result.getOrElseUpdate(row.arg1, new mutable.HashMap[String, Double]())
39 | val arg2Counts = result.getOrElseUpdate(row.arg2, new mutable.HashMap[String, Double]())
40 |
41 | for ((rel, value) <- row.relations) {
42 | val a1 = "A1#" + rel
43 | val a2 = "A2#" + rel
44 | arg1Counts(a1) = arg1Counts.getOrElse(a1, 0.0) + value
45 | arg2Counts(a2) = arg2Counts.getOrElse(a2, 0.0) + value
46 | }
47 | }
48 | result.map(p => p._1 -> Entity(p._1, p._2.toMap)).toMap
49 | }
50 |
51 | def joinRules(rules: Seq[Rules]) = {
52 |
53 | val result = new mutable.HashMap[(String, String), Rule2]
54 | val singleCounts = new mutable.HashMap[String, Double] withDefaultValue 0.0
55 | for (ruleMap <- rules.view.map(_.rules2)) {
56 | for (((r1, r2), rule) <- ruleMap) {
57 | result.get((r1, r2)) match {
58 | case Some(oldRule) =>
59 | result((r1, r2)) = oldRule + rule
60 | case None =>
61 |
result((r1, r2)) = rule //todo: we should use updated single counts if seen in previous rule maps 62 | } 63 | } 64 | } 65 | Rules(result.toMap) 66 | } 67 | 68 | import EmbeddedProbLogicEvaluation._ 69 | 70 | def main(args: Array[String]) { 71 | implicit val conf = ConfigFactory.parseFile(new File("conf/epl-ent.conf")) 72 | implicit val random = new Random(0) 73 | assert(!conf.entrySet().isEmpty, "Couldn't find configuration file.") 74 | 75 | val subsample = conf.getDouble("epl.subsample") 76 | val priorRepulsion = conf.getDouble("epl.prior-repulsion") 77 | val cooccurMin = conf.getDouble("epl.min-cooccur") 78 | 79 | 80 | 81 | def relationFilter(rel: String) = rel.startsWith("path") || (rel.startsWith("REL$") && rel != "REL$NA") 82 | 83 | //load raw data 84 | val trainRaw = FactorizationUtil.loadLiminFile(new File(conf.getString("epl.train")), relationFilter) 85 | val train = FactorizationUtil.filterRows(random.shuffle(trainRaw.toBuffer), conf.getInt("epl.min-rows"), conf.getInt("epl.min-cols")) 86 | 87 | val unlabeledRaw = FactorizationUtil.loadLiminFile(new File(conf.getString("epl.unlabeled")), relationFilter, skipUnlabeled = true) 88 | val unlabeled = FactorizationUtil.filterRows(unlabeledRaw.toSeq, conf.getInt("epl.min-rows"), conf.getInt("epl.min-cols"), !_.startsWith("REL$")) 89 | val combined = if (conf.getBoolean("epl.use-unlabeled")) train ++ unlabeled else train 90 | 91 | 92 | //relations 93 | val trainRelations = combined.flatMap(_.relations.map(_._1)).distinct.sorted // REL$/book/book_edition/author_editor 94 | val freebaseRelations = trainRelations.filter(_.startsWith("REL$")) //Seq("REL$/business/person/company")// 95 | val surfacePatterns = trainRelations.filterNot(_.startsWith("REL$")).toSet 96 | 97 | val testRaw = FactorizationUtil.loadLiminFile(new File(conf.getString("epl.test")), relationFilter, 98 | skipUnlabeled = true, minObsCount = 1).toSeq 99 | val test = FactorizationUtil.filterRows(testRaw, 1, 1, surfacePatterns) 100 | 101 | println(trainRelations.size) 102 | println(train.size) 103 | println(unlabeled.size) 104 | println(test.size) 105 | 106 | println("Extracting entities") 107 | val entities = entitiesFromRows(train ++ unlabeled) 108 | FactorizationUtil.saveToFile(entities.values.toSeq.sortBy(_.entity.toString), new File("/tmp/entities.txt")) 109 | //val filteredEntities = entities.mapValues(e => e.copy(counts = e.counts.toSeq.sortBy(-_._2).take(5).toMap)) 110 | val filteredEntities = Map() ++ entities.mapValues(e => e.copy(counts = random.shuffle(e.counts.toSeq).take(20).toMap)) 111 | FactorizationUtil.saveToFile(filteredEntities.values.toSeq.sortBy(_.entity.toString), new File("/tmp/filtered-entities.txt")) 112 | val testEntities = entitiesFromRows(test) 113 | FactorizationUtil.saveToFile(testEntities.values.toSeq.sortBy(_.entity.toString), new File("/tmp/test-entities.txt")) 114 | 115 | 116 | 117 | val priorCounts = Map((true, false) -> priorRepulsion, (false, true) -> priorRepulsion) withDefaultValue 0.0 118 | 119 | println("Extracting Binary rules") 120 | val trainRulesRaw = 121 | if (conf.getBoolean("epl.combine-datasets")) RuleLearner.learn(combined, priorCounts) 122 | else RuleLearner.learn(train, priorCounts) + RuleLearner.learn(unlabeled, priorCounts) 123 | 124 | println("Extracting Unary rules") 125 | val rulesUnary = EntityRuleLearner.extractUnaryRules(filteredEntities, subSample = 0.01) 126 | FactorizationUtil.saveToFile(rulesUnary.rules2.values.toArray.sortBy(-_.probs(true, true)), new File("/tmp/unary.txt")) 127 | println("Extracting Unary-Binary 
rules") 128 | //val rulesUnary2BinaryTrain = EntityRuleLearner.extractRel2UnaryRules(filteredEntities, train, subSample = 0.01) 129 | val rulesUnary2BinaryCombined = EntityRuleLearner.extractRel2UnaryRules(filteredEntities, train ++ unlabeled, subSample = 0.01) 130 | 131 | //println(rulesUnary2BinaryTrain.rules2.get("A1#path#nsubj|<-nsubj<-have->dobj->|dobj:INV#2" -> "path#nn|<-nn<-station->prep->in->pobj->|pobj")) 132 | //val rulesUnary2BinaryUnlabeled = EntityRuleLearner.extractRel2UnaryRules(filteredEntities, unlabeled, subSample = 0.01) 133 | //println(rulesUnary2BinaryUnlabeled.rules2.get("A1#path#nsubj|<-nsubj<-have->dobj->|dobj:INV#2" -> "path#nn|<-nn<-station->prep->in->pobj->|pobj")) 134 | // val joined = joinRules(Seq(trainRulesRaw,rulesUnary,rulesUnary2BinaryTrain,rulesUnary2BinaryUnlabeled))//trainRulesRaw + rulesUnary + rulesUnary2BinaryTrain + rulesUnary2BinaryUnlabeled 135 | val joined = joinRules(Seq(trainRulesRaw, rulesUnary, rulesUnary2BinaryCombined)) //trainRulesRaw + rulesUnary + rulesUnary2BinaryTrain + rulesUnary2BinaryUnlabeled 136 | println("unary+binary: " + joined.rules2.size) 137 | FactorizationUtil.saveToFile(joined.rules2.values.toSeq.sortBy(-_.cond1given2), new File("/tmp/unary-binary.txt")) 138 | println(joined.rules2.get("A1#path#nsubj|<-nsubj<-have->dobj->|dobj:INV#2" -> "path#nn|<-nn<-station->prep->in->pobj->|pobj")) 139 | 140 | 141 | val trainRules = Rules(joined.rules2.filter(p => p._2.cooccurCount >= 1 || random.nextDouble() < subsample)) 142 | //val trainRulesFiltered = trainRules.copy(rules2 = trainRules.) 143 | 144 | println(s"Original rule count: ${ joined.rules2.size }") 145 | println(s"Filtered rule count: ${ trainRules.rules2.size }") 146 | 147 | 148 | FactorizationUtil.saveToFile(trainRules.rules2.values.toSeq.sortBy(-_.cond1given2), new File("/tmp/ent-rules2.txt")) 149 | 150 | println(s"Embedding ${ trainRules.rules2.size } rules") 151 | val ple = ProbLogicEmbedder.embed(trainRules) 152 | 153 | println("Prediction") 154 | 155 | val predictor = new EntityAwarePredictor(ple, testEntities) 156 | val predictedFacts = test flatMap (row => predictor.predictAll(row, freebaseRelations)) 157 | 158 | FactorizationUtil.saveToFile(predictedFacts.sortBy(-_.fact.score), new File("/tmp/ent-facts.txt")) 159 | 160 | if (conf.getBoolean("epl.print-comparisons")) { 161 | println("Extracting learned rules") 162 | val learnedRules = ple.pairwiseRules(trainRules.rules2.keys) 163 | EmbeddedProbLogicEvaluation.compareRules(learnedRules, trainRules.rules2) 164 | } 165 | } 166 | 167 | } 168 | 169 | object EntityRuleLearner { 170 | 171 | import uclmr.EntityAwareEvaluation._ 172 | 173 | def toRule(rel1: String, rel2: String, 174 | pairCount: Int, singleCount1: Int, singleCount2: Int, normalizer: Double, 175 | priorCounts: Map[(Boolean, Boolean), Double] = Map.empty withDefaultValue 0.0) = { 176 | val prob11 = (pairCount + priorCounts(true, true)) / normalizer 177 | val prob10 = ((singleCount1 - pairCount) + priorCounts(true, false)) / normalizer 178 | val prob01 = ((singleCount2 - pairCount) + priorCounts(false, true)) / normalizer 179 | val prob00 = 1.0 - prob11 - prob10 - prob01 180 | val probs = Map( 181 | (true, true) -> prob11, (true, false) -> prob10, 182 | (false, true) -> prob01, (false, false) -> prob00 183 | ) 184 | Rule2(rel1, rel2, probs, 1.0, count = normalizer, 185 | cond1given2 = prob11 / (prob01 + prob11), 186 | cond2given1 = prob11 / (prob10 + prob11)) 187 | } 188 | 189 | def extractUnaryRules(entities: Map[Any, Entity], 190 | subSample: Double = 1.0, 
191 | priorCounts: Map[(Boolean, Boolean), Double] = Map.empty withDefaultValue 0.0) = {
192 |
193 | val pairCountsArg1 = mutable.HashMap[(String, String), Int]() withDefaultValue 0
194 | val pairCountsArg2 = mutable.HashMap[(String, String), Int]() withDefaultValue 0
195 | val singleCountsInArg1 = mutable.HashMap[String, Int]() withDefaultValue 0
196 | val singleCountsInArg2 = mutable.HashMap[String, Int]() withDefaultValue 0
197 |
198 | println("Entities: " + entities.size)
199 | for (ent <- entities.values) {
200 | for (p <- ent.asArg1) singleCountsInArg1(p) += 1
201 | for (p <- ent.asArg2) singleCountsInArg2(p) += 1
202 | if (ent.entity == "Nevada") {
203 | println(ent.asArg1.mkString(","))
204 | }
205 | if (ent.entity == "OPEC") {
206 | println(ent.asArg1.mkString(","))
207 | }
208 | if (ent.asArg1.contains("A1#path#nn|<-nn<-secretary->appos->|appos#1")) {
209 | println(ent.entity)
210 | println("Blah: " + singleCountsInArg1("A1#path#nn|<-nn<-secretary->appos->|appos#1"))
211 | }
212 | for (p1 <- ent.asArg1; p2 <- ent.asArg1; if p1 != p2) {
213 | pairCountsArg1(p1 -> p2) += 1
214 | }
215 | for (p1 <- ent.asArg2; p2 <- ent.asArg2; if p1 != p2) {
216 | pairCountsArg2(p1 -> p2) += 1
217 | }
218 | }
219 | println("Done counting")
220 | val arg1s = singleCountsInArg1.keys.toArray.sorted
221 | val arg2s = singleCountsInArg2.keys.toArray.sorted
222 | val normalizer = entities.size.toDouble + priorCounts.values.sum
223 | val result = new mutable.HashMap[(String, String), Rule2]()
224 | println("Done sorting etc.")
225 | for (i1 <- 0 until arg1s.size; i2 <- i1 + 1 until arg1s.size) {
226 | val a1 = arg1s(i1)
227 | val a2 = arg1s(i2)
228 | if (a2 == "A1#path#nn|<-nn<-secretary->appos->|appos#1") {
229 | println(toRule(a1, a2, pairCountsArg1(a1 -> a2), singleCountsInArg1(a1), singleCountsInArg1(a2), normalizer, priorCounts))
230 | }
231 | if (pairCountsArg1(a1, a2) >= 1 || random.nextDouble() < subSample)
232 | result(a1 -> a2) = toRule(a1, a2, pairCountsArg1(a1 -> a2), singleCountsInArg1(a1), singleCountsInArg1(a2), normalizer, priorCounts)
233 | }
234 | for (i1 <- 0 until arg2s.size; i2 <- i1 + 1 until arg2s.size) {
235 | val a1 = arg2s(i1)
236 | val a2 = arg2s(i2)
237 | if (a2 == "A1#path#nn|<-nn<-secretary->appos->|appos#2") {
238 | println(toRule(a1, a2, pairCountsArg2(a1 -> a2), singleCountsInArg2(a1), singleCountsInArg2(a2), normalizer, priorCounts))
239 | }
240 |
241 | if (pairCountsArg2(a1, a2) >= 1 || random.nextDouble() < subSample)
242 | result(a1 -> a2) = toRule(a1, a2, pairCountsArg2(a1 -> a2), singleCountsInArg2(a1), singleCountsInArg2(a2), normalizer, priorCounts)
243 | }
244 | println("Done!")
245 | Rules(result.toMap)
246 | }
247 |
248 |
249 | def extractRel2UnaryRules(entities: Map[Any, Entity],
250 | rows: Seq[Row],
251 | subSample: Double = 1.0,
252 | priorCounts: Map[(Boolean, Boolean), Double] = Map.empty withDefaultValue 0.0) = {
253 |
254 | val pairCounts = mutable.HashMap[(String, String), Int]() withDefaultValue 0
255 | val singleCounts = mutable.HashMap[String, Int]() withDefaultValue 0
256 | val singleCountsArgs = mutable.HashMap[String, Int]() withDefaultValue 0
257 |
258 |
259 | for (row <- rows) {
260 | val cells = row.relations
261 | val arg1 = entities(row.arg1)
262 | val arg2 = entities(row.arg2)
263 | for (cell <- cells) singleCounts(cell._1) += 1
264 | for (a1 <- arg1.asArg1) singleCountsArgs(a1) += 1
265 | for (a2 <- arg2.asArg2) singleCountsArgs(a2) += 1
266 |
267 | //we should avoid rules between unary and binary relations that are derived from the same binary relation
268 | for ((rel, _) <-
cells; a1 <- arg1.asArg1 if unaryToBinary(a1) != rel) { 269 | pairCounts(rel -> a1) += 1 270 | } 271 | for ((rel, _) <- cells; a2 <- arg2.asArg2 if unaryToBinary(a2) != rel) { 272 | pairCounts(rel -> a2) += 1 273 | } 274 | } 275 | 276 | val normalizer = rows.size.toDouble + priorCounts.values.sum 277 | 278 | println("Done counting") 279 | 280 | val result = new mutable.HashMap[(String, String), Rule2]() 281 | for (rel <- singleCounts.keys; 282 | arg <- singleCountsArgs.keys) { 283 | val (r1, r2, counts1, counts2) = 284 | if (rel.compareTo(arg) < 0) 285 | (rel, arg, singleCounts(rel), singleCountsArgs(arg)) 286 | else 287 | (arg, rel, singleCountsArgs(arg), singleCounts(rel)) 288 | if (pairCounts(r1, r2) >= 1 || random.nextDouble() < subSample) { 289 | // result(rel -> arg) = toRule(rel, arg, pairCounts(rel, arg), singleCounts(rel), singleCountsArgs(arg), normalizer, priorCounts) 290 | result((r1, r2)) = toRule(r1, r2, pairCounts(r1, r2), counts1, counts2, normalizer, priorCounts) 291 | } 292 | 293 | 294 | // if (arg == "A2#path#dobj|<-dobj<-replace->prep->in->pobj->|pobj#2") { 295 | // val rule = toRule(rel, arg, pairCounts(rel, arg), singleCounts(rel), singleCountsArgs(arg), normalizer, priorCounts) 296 | // if (rule.cond1given2 > 0.9) 297 | // println(rule) 298 | // } 299 | 300 | } 301 | println("Done!") 302 | Rules(result.toMap) 303 | } 304 | 305 | } 306 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/EntityAwarePredictor.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import uclmr.EntityAwareEvaluation.Entity 4 | import uclmr.FactorizationUtil.{PredictedFact, Row} 5 | import ml.wolfe.util.Util 6 | 7 | import scala.collection.mutable 8 | 9 | /** 10 | * @author Sebastian Riedel 11 | */ 12 | class EntityAwarePredictor(val embeddings: ProbLogicEmbeddings, val entities: Map[Any, Entity]) { 13 | 14 | val distanceCache = new mutable.HashMap[(String, String), Double]() 15 | 16 | def closest(candidates: Iterable[String], target: String) = { 17 | if (candidates.isEmpty) ("NA", Double.PositiveInfinity) 18 | else 19 | candidates.map(pred => { 20 | val dist = distanceCache.getOrElseUpdate(pred -> target, 21 | embeddings.embeddings(target).distance(embeddings.embeddings(pred))) 22 | pred -> dist 23 | }).minBy(_._2) 24 | } 25 | 26 | def farthest(candidates: Iterable[String], target: String) = { 27 | if (candidates.isEmpty) ("NA", Double.PositiveInfinity) 28 | else 29 | candidates.map(pred => { 30 | val dist = distanceCache.getOrElseUpdate(pred -> target, 31 | embeddings.embeddings(target).distance(embeddings.embeddings(pred))) 32 | pred -> dist 33 | }).maxBy(_._2) 34 | } 35 | 36 | def predictAll(row: Row, targetRelations:Seq[String], useFilter:Boolean = true) = { 37 | targetRelations.map(predict(row,_,useFilter)) 38 | } 39 | 40 | import EntityAwareEvaluation._ 41 | 42 | def predict(row: Row, target: String, useFilter:Boolean = true) = { 43 | val arg1 = entities(row.arg1) 44 | val arg2 = entities(row.arg2) 45 | 46 | val targetEmbedding = embeddings.embeddings(target) 47 | 48 | def filterObs(obs:Iterable[String]) = if (useFilter) obs.filter(targetEmbedding.observationFilter) else obs 49 | def asProb(pair:(String,Double)) = pair.copy(_2 = Util.sig(targetEmbedding.bias - pair._2)) 50 | 51 | //find best unary predicate for arg1 52 | val arg1Result = closest(filterObs(arg1.asArg1), target) 53 | //find best unary predicate for arg2 54 | val arg2Result = closest(filterObs(arg2.asArg2), 
target) 55 | //find best binary predicate as observation 56 | val relResult = closest(filterObs(row.relations.view.map(_._1)), target) 57 | 58 | val (predictor, score) = Iterator(arg1Result, arg2Result, relResult).maxBy(_._2) 59 | 60 | val prob = Util.sig(targetEmbedding.bias - score) 61 | EntityAwarePrediction( 62 | PredictedFact(row, target, prob), predictor, 63 | asProb(arg1Result), asProb(arg2Result), asProb(relResult) 64 | ) 65 | } 66 | 67 | } 68 | 69 | case class EntityAwarePrediction(fact: PredictedFact, predictor: String, 70 | arg1Result: (String, Double), arg2Result: (String, Double), relResult: (String, Double)) { 71 | override def toString = { 72 | s""" 73 | |$fact 74 | | Predictor: $predictor 75 | | Arg1: $arg1Result 76 | | Arg2: $arg2Result 77 | | Rel: $relResult 78 | """.stripMargin 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/FactorizationUtil.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import java.io._ 4 | 5 | import scala.collection.mutable 6 | import scala.collection.mutable.ArrayBuffer 7 | import scala.io.Source 8 | import scala.util.Random 9 | 10 | /** 11 | * @author Sebastian Riedel 12 | */ 13 | object FactorizationUtil { 14 | 15 | case class Row(arg1: Any, arg2: Any, relations: Seq[(String, Double)], hidden: Set[String] = Set.empty) { 16 | def rowName = s"($arg1,$arg2)" 17 | def observedTrue = relations.filter(_._2 > 0.5).map(_._1) 18 | } 19 | 20 | def sampleRows(rows: Int, rels: Int, density: Double = 0.1)(implicit random: Random) = { 21 | for (pair <- 0 until rows) yield { 22 | val cells = for (rel <- 0 until rels; if random.nextDouble() <= density) yield ("r" + rel, 1.0) 23 | Row(pair.toString, pair.toString, cells) 24 | } 25 | } 26 | 27 | def loadLiminFile(file: File, 28 | relationFilter: String => Boolean = _ => true, 29 | freebaseLabels: Seq[String] = Seq(), minObsCount: Int = 2, skipUnlabeled: Boolean = false): Iterator[Row] = { 30 | val source = Source.fromFile(file, "ISO-8859-1") 31 | for (line <- source.getLines(); 32 | split = line.split("\t"); 33 | arg1 = split(1); 34 | arg2 = split(2); 35 | filteredRelations = split.drop(3).filter(relationFilter) 36 | if filteredRelations.size >= minObsCount && (!skipUnlabeled || split(0) != "UNLABELED") 37 | ) yield { 38 | 39 | val asSet = filteredRelations.toSet 40 | //POSITIVE: entity pair in freebase, and one relation was seen 41 | //NEGATIVE: entity pair in freebase, but no relation was observed, this means that we can 42 | // more confidently label them negative 43 | //UNLABELLED: entity pair not in freebase, in some sense 44 | val cells = split(0) match { 45 | case "POSITIVE" => filteredRelations.map((_, 1.0)) ++ freebaseLabels.filterNot(asSet).map((_, 0.0)) 46 | case "NEGATIVE" => filteredRelations.map((_, 1.0)) ++ freebaseLabels.map((_, 0.0)) 47 | case "UNLABELED" => filteredRelations.map((_, 1.0)) 48 | } 49 | Row(arg1, arg2, cells) 50 | } 51 | } 52 | 53 | def filterRows(rows: Seq[Row], minRowCount: Int = 10, minColCount: Int = 2, relFilter: String => Boolean = _ => true): Seq[Row] = { 54 | //rule: every row should have at least minColCount active cells, and each column needs minRowCount. 
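// Illustrative walk-through (assumed example, not from the original source): with
// minRowCount = 2 and minColCount = 1, a relation that fires in only one row is pruned:
//
//   val rows = Seq(
//     Row("e1", "e2", Seq("relA" -> 1.0, "relB" -> 1.0)),
//     Row("e3", "e4", Seq("relA" -> 1.0)))
//   filterRows(rows, minRowCount = 2, minColCount = 1)
//   // "relA" sums to 2.0 across rows and is kept; "relB" sums to 1.0 and is dropped,
//   // after which both rows still have at least one cell and survive.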
55 | val counts = new mutable.HashMap[String, Double]() withDefaultValue 0.0
56 | for (row <- rows; (rel, value) <- row.relations if relFilter(rel)) counts(rel) += value
57 |
58 | for (row <- rows;
59 | cells = row.relations.filter(c => counts(c._1) >= minRowCount)
60 | if cells.size >= minColCount) yield {
61 | row.copy(relations = cells)
62 | }
63 | }
64 |
65 | def filterRowsPairwise(rows: Seq[Row], minPairCount: Int = 3): Seq[Row] = {
66 | //alternative: each relation should have at least one other relation with at least minPairCount co-occurrences
67 | val counts = new mutable.HashMap[(String, String), Double]() withDefaultValue 0.0
68 | for (row <- rows;
69 | (rel1, value1) <- row.relations;
70 | (rel2, value2) <- row.relations if rel1 != rel2) counts(rel1 -> rel2) += value1 * value2
71 | val maxCounts = counts.toSeq.groupBy(_._1._1).mapValues(_.view.map(_._2).max)
72 | for (row <- rows;
73 | cells = row.relations.filter(c => maxCounts(c._1) >= minPairCount)) yield row.copy(relations = cells)
74 | }
75 |
76 |
77 | case class PredictedFact(row: Row, relation: String, score: Double) {
78 | override def toString = s"$score\t$relation\t${ row.rowName }\t${ row.observedTrue.mkString(" ") }"
79 | def toUSchemaString = s"$score\t${ row.arg1 }\t${ row.arg2 }\tREL${ "$NA" }\t$relation"
80 | }
81 |
82 | def toRankedFacts(predictions: Seq[(Row, Row)]): Seq[PredictedFact] = {
83 | val facts = for ((obs, guess) <- predictions; (rel, value) <- guess.relations) yield PredictedFact(obs, rel, value)
84 | val sorted = facts.sortBy(-_.score)
85 | sorted
86 | }
87 |
88 | def saveToFile(content: String, file: File): Unit = {
89 | val out = new PrintStream(file)
90 | out.println(content)
91 | out.close()
92 | }
93 |
94 | def saveToFile[T](content: Iterable[T], file: File): Unit = {
95 | val out = new PrintWriter(new BufferedWriter(new FileWriter(file)))
96 | for (line <- content)
97 | out.println(line.toString)
98 | out.close()
99 | }
100 |
101 |
102 | def saveForUSchemaEval(facts: Seq[PredictedFact], file: File): Unit = {
103 | val out = new PrintStream(file)
104 | for (fact <- facts) {
105 | out.println(fact.toUSchemaString)
106 | }
107 | out.close()
108 | }
109 |
110 | def renderPredictions(prediction: Seq[Row], truth: Seq[Row] = Seq.empty) = {
111 | import ml.wolfe.util.ANSIFormatter._
112 | val relations =
113 | (prediction.flatMap(_.relations.map(_._1)) ++ truth.flatMap(_.relations.map(_._1))).distinct.sorted
114 | val colWidth = math.max(relations.map(_.toString.length).max + 1, 5)
115 | val firstColWidth = prediction.map(_.rowName.length).max + 1
116 |
117 | val colFormat = "%" + colWidth + "s"
118 | val firstColFormat = "%" + firstColWidth + "s"
119 | val cellFormat = "%" + (colWidth - 1) + "s "
120 | val pFormat = "%4.2f"
121 |
122 | val sb = new mutable.StringBuilder()
123 | sb ++= " " * firstColWidth
124 | relations.foreach(col => sb ++= colFormat.format(col))
125 | sb ++= "\n"
126 |
127 | val truthMap = truth.map(r => (r.arg1, r.arg2) -> r).toMap
128 |
129 | for (row <- prediction) {
130 | val trueRow = truthMap.get((row.arg1, row.arg2))
131 | sb ++= firstColFormat.format(row.rowName) + " "
132 | val col2value = row.relations.toMap withDefaultValue 0.0
133 | val col2trueValue = trueRow.map(_.relations.toMap).getOrElse(Map.empty)
134 | for (col <- relations) {
135 | val score = col2value(col)
136 | val pString = cellFormat.format(pFormat.format(score))
137 | val actualString = col2trueValue.get(col) match {
138 | case Some(value) => if (value > 0.5) pString.onGreen() else pString
139 | case None => pString
140 | }
141 | sb
++= actualString 142 | } 143 | sb ++= "\n" 144 | 145 | } 146 | sb.toString() 147 | 148 | } 149 | 150 | def filterRankedFile(dest: String, filterTuple: String, source: String) { 151 | val allowed = new mutable.HashSet[Seq[Any]]() 152 | 153 | val out = new PrintStream(dest) 154 | 155 | for (line <- Source.fromFile(filterTuple).getLines(); if line.trim != "") { 156 | val split = line.split("\t") 157 | val tuple = if (split.size == 2) Seq(split(0), split(1)) else Seq(split(1), split(2)) 158 | allowed += tuple 159 | } 160 | println(allowed.size) 161 | 162 | def norm(label: String) = if (label.contains("/") && !label.startsWith("REL$")) "REL$" + label else label 163 | 164 | for (line <- Source.fromFile(source).getLines()) { 165 | val split = line.split("[\t]") 166 | if (split(1).contains("|")) { 167 | val tuple = split(1).split("\\|").toSeq 168 | if (allowed(tuple)) out.println(split(0) + "\t" + tuple.mkString("\t") + "\t" + split.drop(2).map(norm).mkString("\t")) 169 | } else { 170 | val tuple = Seq(split(1), split(2)) 171 | if (allowed(tuple)) out.println(split.take(3).mkString("\t") + "\t" + split.drop(3).map(norm).mkString("\t")) 172 | } 173 | } 174 | 175 | out.close() 176 | } 177 | 178 | def main(args: Array[String]) { 179 | filterRankedFile( 180 | "/tmp/ple-subsample.txt", 181 | "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.test.subsample-10000.tuples.txt", 182 | "/tmp/ple.txt" 183 | ) 184 | } 185 | 186 | 187 | } 188 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/LogicalInference.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import uclmr.CellType.CellType 4 | 5 | /** 6 | * @author rockt 7 | * Very basic logical inference. Assumes that the formulaList or the formulae in db are consistent. 8 | * todo: can be sped up by not touching premises twice 9 | */ 10 | object LogicalInference { 11 | def apply(db: TensorDB, formulaList: List[Formula] = Nil, newCellType: CellType = CellType.Train, usePredictions: Boolean = false, threshold: Double = 0.5): Unit = { 12 | var converged = false 13 | 14 | val formulae = if (formulaList.isEmpty) db.formulae.toList else formulaList 15 | while (!converged) { 16 | converged = true 17 | 18 | for (formula <- formulae) formula match { 19 | case Impl(p1, p2, _) => 20 | val cs = if (usePredictions) db.getPredictedBy1(p1, threshold) else db.getBy1(p1) 21 | cs.foreach(c => { 22 | val (c1, c2) = c 23 | val cellOpt = db.get(p2, c1, c2) 24 | 25 | if (!cellOpt.isDefined || cellOpt.get.cellType != newCellType) { 26 | converged = false 27 | db += Cell(p2, c1, c2, target = 1.0, cellType = newCellType) 28 | } 29 | }) 30 | case ImplNeg(p1, p2, _) => 31 | val cs = if (usePredictions) db.getPredictedBy1(p1, threshold) else db.getBy1(p1) 32 | cs.foreach(c => { 33 | val (c1, c2) = c 34 | val cellOpt = db.get(p2, c1, c2) 35 | 36 | if (!cellOpt.isDefined || cellOpt.get.cellType != newCellType) { 37 | converged = false 38 | db += Cell(p2, c1, c2, target = 0.0, cellType = newCellType) 39 | } 40 | }) 41 | case _ => ??? 
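// Note on the fixed point above: `converged` is reset whenever a consequent cell is
// still missing or not yet marked with `newCellType`, so the while-loop keeps sweeping
// over all formulae until a full pass changes nothing. Chained implications therefore
// fire transitively, e.g. Impl("r4", "r6") and Impl("r6", "r2") turn a true r4 cell
// into an r6 cell on one pass and an r2 cell on the next.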
42 | }
43 | }
44 |
45 | }
46 | }
47 |
48 | object LogicalInferenceSpec extends App {
49 | val k = 5
50 | val db = new TensorKB(k)
51 | db.sampleTensor(10, 10, 0, 0.1)
52 | db.toFactorGraph
53 |
54 | db += Cell("r6", "e6", DefaultIx, 0.0, CellType.Test)
55 |
56 | db += Impl("r4", "r6")
57 | db += Impl("r6", "r2")
58 |
59 | println(db.toVerboseString())
60 |
61 | //fixme: second baseline actually needs to go over *predicted* true premises
62 | LogicalInference(db, newCellType = CellType.Inferred, usePredictions = true, threshold = 0.49)
63 | //LogicalInference(db, newCellType = CellType.Inferred)
64 |
65 |
66 | println(db.toVerboseString())
67 |
68 | println("Inferred cells:\n" + db.inferredCells.mkString("\n"))
69 | }
-------------------------------------------------------------------------------- /src/main/scala/uclmr/MatrixFactorization.scala: --------------------------------------------------------------------------------
1 | package uclmr
2 |
3 | import java.io.{File, FileWriter}
4 |
5 | import cc.factorie.la.SparseBinaryTensor1
6 | import cc.factorie.optimize._
7 | import uclmr.io._
8 | import ml.wolfe.fg.{L2Regularization, _}
9 | import ml.wolfe.util.{Conf, ProgressLogging, Timer}
10 | import ml.wolfe.{DenseVector, GradientBasedOptimizer, Wolfe}
11 |
12 | import scala.io.Source
13 | import scala.util.Random
14 |
15 | /**
16 | * @author Sebastian Riedel
17 | * @author rockt
18 | */
19 |
20 | object MatrixFactorization extends App {
21 | val mf = new MatrixFactorization(args.lift(0).getOrElse("conf/mf.conf"))
22 | val wMAP = mf.run()
23 | println(wMAP)
24 | }
25 |
26 | class MatrixFactorization(confPath: String = "conf/mf.conf") {
27 | val debug = false //whether to use a small synthetic matrix or actual data
28 | val loadFormulae = debug && true //whether formulae should be sampled for debugging
29 | //val print = false //whether to print the matrix (only do this for small ones!)
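// Background sketch (illustrative only, not part of the original file): each observed
// cell (relation column c, entity-pair row r) is scored by a logistic model on the dot
// product of two k-dimensional embeddings, roughly:
//
//   def cellProb(col: Array[Double], row: Array[Double]): Double = {
//     val score = col.indices.map(i => col(i) * row(i)).sum // dot product of column and row vector
//     1.0 / (1.0 + math.exp(-score))                        // sigmoid maps the score into (0, 1)
//   }
//
// The CellLogisticLoss factors built below push this probability towards 1.0 for
// observed cells and, via sampled negative factors, towards 0.0 for unobserved ones.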
30 | 31 | Conf.add(confPath) 32 | Conf.outDir //sets up output directory 33 | implicit val conf = Conf 34 | println("Using " + confPath) 35 | 36 | val dataType = conf.getString("dataType") 37 | assert(dataType == "naacl" || dataType == "figer" || dataType == "tsv", s"dataType $dataType should be 'naacl' or 'figer' or 'tsv'.") 38 | val useFeatures = (dataType == "figer" && conf.getBoolean("figer.use-features")) || (dataType == "naacl" && conf.getBoolean("mf.use-features")) 39 | 40 | val outputPath = conf.getString("outDir") 41 | val fileName = conf.getString("mf.outFile") 42 | 43 | val mode = conf.getString("mf.mode") 44 | 45 | //model parameters 46 | val k = conf.getInt("mf.k") 47 | val lambda = conf.getDouble("mf.lambda") 48 | val alpha = conf.getDouble("mf.alpha") 49 | val maxIter = conf.getInt("mf.maxIter") 50 | 51 | val subsample = conf.getDouble("mf.subsample") 52 | val negPerPos = conf.getInt("mf.negPerPos") 53 | val unobservedPerF = conf.getInt("mf.unobservedPerF") 54 | 55 | val cellWeight = conf.getDouble("mf.cellWeight") 56 | val formulaeWeight = conf.getDouble("mf.formulaeWeight") 57 | 58 | val optimizer = conf.getString("mf.optimizer") 59 | val batchTraining = conf.getBoolean("mf.batchTraining") 60 | 61 | val bpr = conf.getBoolean("mf.bpr") 62 | 63 | val postInferenceThreshold = 0.5 64 | 65 | 66 | val db = if (debug) { 67 | val tmp = new TensorKB(4) 68 | tmp.sampleTensor(10, 10, 0, 0.1) //samples a matrix 69 | //tmp += Cell("r3", "r3-#premise") 70 | //tmp += Cell("r4", "r4-#consequent") 71 | if (loadFormulae) { 72 | tmp += Impl("r3", "r4") 73 | tmp += Impl("r4", "r6") 74 | tmp += Impl("r6", "r2") 75 | //tmp += ImplNeg("r8", "r6") 76 | } 77 | tmp 78 | } else dataType match { 79 | case "naacl" => LoadNAACL(k, subsample) 80 | case "figer" => LoadFIGER(k, subsample) 81 | case "tsv" => LoadTSV(k, subsample) 82 | } 83 | 84 | val rand = new Random(0l) 85 | 86 | val fg = db.toFactorGraph 87 | 88 | val trainDebugString = if (debug) db.toVerboseString(showTrain = true) else "" 89 | if (mode == "pre-inference" || mode == "pre-post-inference") LogicalInference(db, newCellType = CellType.Train) 90 | 91 | val data = rand.shuffle(db.trainCells) 92 | val colNodes = db.ix1ToNodeMap //cols 93 | val rowNodes = db.ix2ToNodeMap //rows 94 | 95 | //initialize embeddings 96 | //def nextInit() = (rand.nextDouble() - 0.5) * 0.1 97 | def nextInit() = rand.nextGaussian() * 0.1 98 | (colNodes.values.view ++ rowNodes.values.view).foreach(n => 99 | n.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray)) 100 | if (useFeatures) db match { 101 | case f: Features => 102 | f.fwnodes1.foreach(n => n.variable.asVector.b = new DenseVector((0 until f.numFeatures1).map(i => nextInit()).toArray)) 103 | f.fwnodes2.foreach(n => n.variable.asVector.b = new DenseVector((0 until f.numFeatures2).map(i => nextInit()).toArray)) 104 | } 105 | 106 | 107 | 108 | //fact factors 109 | for (d <- data) { 110 | val (colIx, rowIx, _) = d.key 111 | val r = rowNodes(rowIx) //entity 112 | val c = colNodes(colIx) //relation 113 | 114 | if (bpr) fg.buildStochasticFactor(Seq(r, db.sampleNodeFrom2(colIx), c))(_ map (_ => new VectorMsgs)) { 115 | e => new BPRPotential(e(0), e(1), e(2), 1.0, lambda) with L2Regularization 116 | } 117 | else { 118 | if (useFeatures) db match { 119 | case dbf: Features => { 120 | // assumes only features on rows (weights for each column) 121 | val fwnode = dbf.fwnode2(colIx).get 122 | val fnode = dbf.fnode2(rowIx).get 123 | fg.buildFactor(Seq(r, c, fwnode, fnode))(_ map (_ => new 
VectorMsgs)) { 124 | e => new CellLogisticLossWithRowFeatures(e(0), e(1), e(2), e(3), 1.0, lambda, cellWeight) with L2Regularization 125 | } 126 | 127 | (0 until negPerPos).foreach { i => 128 | fg.buildStochasticFactor({ 129 | val nr = db.sampleNodeFrom2(colIx) 130 | val nrfnode = dbf.fnode2(nr.variable.label).get 131 | Seq(nr, c, fwnode, nrfnode) 132 | })(_ map (_ => new VectorMsgs)) { 133 | e => new CellLogisticLossWithRowFeatures(e(0), e(1), e(2), e(3), 0.0, lambda, cellWeight / negPerPos) with L2Regularization 134 | } 135 | } 136 | } 137 | } else { 138 | fg.buildFactor(Seq(r, c))(_ map (_ => new VectorMsgs)) { 139 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda, cellWeight) with L2Regularization 140 | } 141 | 142 | (0 until negPerPos).foreach { i => 143 | fg.buildStochasticFactor(Seq(c, db.sampleNodeFrom2(colIx)))(_ map (_ => new VectorMsgs)) { 144 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, cellWeight / negPerPos) with L2Regularization 145 | } 146 | } 147 | } 148 | } 149 | } 150 | 151 | if (mode == "low-rank-logic") { 152 | //formulae factors 153 | for (d <- data) { 154 | //colIx: relation 155 | //rowIx: entity 156 | val (colIx, rowIx, _) = d.key 157 | 158 | val a = rowNodes(rowIx) 159 | val v = colNodes(colIx) 160 | 161 | for (formula <- db.formulaeByPredicate(colIx)) { 162 | val cNode = v 163 | if (formula.isFormula2) { 164 | val Seq(p1, p2) = formula.predicates 165 | 166 | //can only inject formulae whose predicates exist 167 | if (db.node1(p1).isDefined && db.node1(p2).isDefined) { 168 | val p1Node = db.node1(p1).get 169 | val p2Node = db.node1(p2).get 170 | 171 | formula match { 172 | case Impl(_, _, target) => 173 | fg.buildFactor(Seq(cNode, p1Node, p2Node))(_ map (_ => new VectorMsgs)) { 174 | e => new ImplPotential(e(0), e(1), e(2), target, lambda, formulaeWeight) with L2Regularization 175 | } 176 | (0 until unobservedPerF).foreach { i => 177 | fg.buildStochasticFactor(Seq(db.sampleNodeFrom2(colIx, sampleTestRows = Conf.getBoolean("mf.test-row-terms")), p1Node, p2Node))(_ map (_ => new VectorMsgs)) { 178 | e => new ImplPotential(e(0), e(1), e(2), target, lambda, formulaeWeight) with L2Regularization 179 | } 180 | } 181 | 182 | case ImplNeg(_, _, target) => 183 | fg.buildFactor(Seq(cNode, p1Node, p2Node))(_ map (_ => new VectorMsgs)) { 184 | e => new ImplNegPotential(e(0), e(1), e(2), target, lambda, formulaeWeight) with L2Regularization 185 | } 186 | (0 until unobservedPerF).foreach { i => 187 | fg.buildStochasticFactor(Seq(db.sampleNodeFrom2(colIx, sampleTestRows = Conf.getBoolean("mf.test-row-terms")), p1Node, p2Node))(_ map (_ => new VectorMsgs)) { 188 | e => new ImplNegPotential(e(0), e(1), e(2), target, lambda, formulaeWeight) with L2Regularization 189 | } 190 | } 191 | } 192 | } 193 | } else { 194 | ??? 
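// Only pairwise formulae (Impl / ImplNeg over two predicates) are turned into
// potentials above; any other formula type falls through to this unimplemented branch.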
195 | } 196 | } 197 | } 198 | } 199 | if (mode == "gen-fake-data") { 200 | //formulae factors 201 | var fidx = 0 202 | val numFakeData = conf.getInt("mf.num-fake-data") 203 | val weight = numFakeData * formulaeWeight 204 | for (formula <- db.formulae) { 205 | if (formula.isFormula2) { 206 | val Seq(p1, p2) = formula.predicates 207 | //can only inject formulae whose predicates exist 208 | if (db.node1(p1).isDefined && db.node1(p2).isDefined) { 209 | val p1Node = db.node1(p1).get 210 | val p2Node = db.node1(p2).get 211 | 212 | formula match { 213 | case Impl(_, _, target) => { 214 | println("Adding fake cells for formula: " + p1.toString + " -> " + p2.toString) 215 | val e11 = fg.addVectorNode(k, "e11+" + fidx) 216 | e11.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray) 217 | fg.buildFactor(Seq(p1Node, e11))(_ map (_ => new VectorMsgs)) { 218 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda, weight) with L2Regularization 219 | } 220 | fg.buildFactor(Seq(p2Node, e11))(_ map (_ => new VectorMsgs)) { 221 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda, weight) with L2Regularization 222 | } 223 | val e01 = fg.addVectorNode(k, "e01+" + fidx) 224 | e01.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray) 225 | fg.buildFactor(Seq(p1Node, e01))(_ map (_ => new VectorMsgs)) { 226 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 227 | } 228 | fg.buildFactor(Seq(p2Node, e01))(_ map (_ => new VectorMsgs)) { 229 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda, weight) with L2Regularization 230 | } 231 | val e00 = fg.addVectorNode(k, "e00+" + fidx) 232 | e00.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray) 233 | fg.buildFactor(Seq(p1Node, e00))(_ map (_ => new VectorMsgs)) { 234 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 235 | } 236 | fg.buildFactor(Seq(p2Node, e00))(_ map (_ => new VectorMsgs)) { 237 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 238 | } 239 | } 240 | case ImplNeg(_, _, target) => { 241 | println("Adding fake cells for formula: " + p1.toString + " -> !" 
+ p2.toString) 242 | val e10 = fg.addVectorNode(k, "e10+" + fidx) 243 | e10.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray) 244 | fg.buildFactor(Seq(p1Node, e10))(_ map (_ => new VectorMsgs)) { 245 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda, weight) with L2Regularization 246 | } 247 | fg.buildFactor(Seq(p2Node, e10))(_ map (_ => new VectorMsgs)) { 248 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 249 | } 250 | val e01 = fg.addVectorNode(k, "e01+" + fidx) 251 | e01.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray) 252 | fg.buildFactor(Seq(p1Node, e01))(_ map (_ => new VectorMsgs)) { 253 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 254 | } 255 | fg.buildFactor(Seq(p2Node, e01))(_ map (_ => new VectorMsgs)) { 256 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda, weight) with L2Regularization 257 | } 258 | val e00 = fg.addVectorNode(k, "e00+" + fidx) 259 | e00.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray) 260 | fg.buildFactor(Seq(p1Node, e00))(_ map (_ => new VectorMsgs)) { 261 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 262 | } 263 | fg.buildFactor(Seq(p2Node, e00))(_ map (_ => new VectorMsgs)) { 264 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 265 | } 266 | } 267 | } 268 | } 269 | fidx += 1 270 | } else { 271 | ??? 272 | } 273 | } 274 | } 275 | 276 | fg.build() 277 | 278 | 279 | println("DB:" + db.toInfoString) 280 | println("FG:" + fg.toInspectionString) 281 | 282 | val gradientOptimizer = optimizer match { 283 | case "SGD" => new ConstantLearningRate(baseRate = alpha) 284 | case "AdaGrad" => new AdaGrad(rate = alpha) 285 | case "AdaMira" => new AdaMira(rate = alpha) //rockt: doesn't seem to make a difference to AdaGrad 286 | case "LBFGS" => new LBFGS(Double.MaxValue, Int.MaxValue) //rockt: not working atm 287 | case "AvgPerceptron" => new AveragedPerceptron() 288 | } 289 | 290 | 291 | def run(): Double = { 292 | println("Optimizing...") 293 | Timer.time("optimization") { 294 | if (mode != "inference-only") 295 | GradientBasedOptimizer(fg, 296 | if (batchTraining) new BatchTrainer(_, gradientOptimizer, maxIter) with ProgressLogging 297 | else new OnlineTrainer(_, gradientOptimizer, maxIter, fg.factors.size - 1) with ProgressLogging 298 | ) 299 | 300 | if (mode == "post-inference" || mode == "pre-post-inference") 301 | LogicalInference(db, newCellType = CellType.Inferred, usePredictions = true, threshold = postInferenceThreshold) 302 | if (mode == "inference-only") 303 | LogicalInference(db, newCellType = CellType.Inferred) 304 | } 305 | println("Done after " + Timer.reportedVerbose("optimization")) 306 | 307 | var wMAP = 0.0 308 | 309 | if (debug) { 310 | println("train:") 311 | println(trainDebugString) 312 | println() 313 | println("predicted:") 314 | println(db.toVerboseString()) 315 | if (Conf.getBoolean("serialize")) db.serialize(Conf.outDir.getAbsolutePath + "/serialized/") 316 | } else { 317 | Conf.createSymbolicLinkToLatest() //rewire symbolic link to latest (in case it got overwritten) 318 | val pathToPredict = Conf.outDir.getAbsolutePath + "/" + fileName 319 | dataType match { 320 | case "naacl" => 321 | WriteNAACL(db, pathToPredict) 322 | val evalConf = "./conf/" + Conf.getString("evalConf") 323 | wMAP = new EvaluateNAACL(evalConf, pathToPredict).eval() 324 | case "figer" => 325 | WriteFIGER(db, pathToPredict) 326 | 
EvaluateFIGER.main(Array(pathToPredict, Conf.outDir.getAbsolutePath)) 327 | case "tsv" => //todo: write out predictions (for all cells?) 328 | } 329 | 330 | //db.writeVectors(Conf.outDir.getAbsolutePath + "/vectors.tsv") 331 | 332 | if (Conf.getBoolean("serialize")) db.serialize(Conf.outDir.getAbsolutePath + "/serialized/") 333 | 334 | 335 | 336 | import scala.sys.process._ 337 | Process("pdflatex -interaction nonstopmode -shell-escape table.tex", new File(Conf.outDir.getAbsolutePath)).!! 338 | 339 | if (Conf.hasPath(dataType + ".formulaeFile") && Conf.getString(dataType + ".formulaeFile") != "None") { 340 | val formulaeFile = new File(Conf.getString(dataType + ".formulaeFile")) 341 | val lines = Source.fromFile(formulaeFile).getLines() 342 | val writer = new FileWriter(Conf.outDir.getAbsolutePath + "/" + formulaeFile.getAbsolutePath.split("/").last) 343 | writer.write(lines.mkString("\n")) 344 | writer.close() 345 | } 346 | } 347 | 348 | wMAP 349 | } 350 | } 351 | 352 | object WolfeStyleMF extends App { 353 | 354 | import ml.wolfe.Wolfe._ 355 | import ml.wolfe.macros.OptimizedOperators._ 356 | case class Data(rel:String, arg1:String, arg2:String, target:Double) 357 | 358 | case class Model(relationVectors:Map[String,Seq[Double]], entityPairVectors:Map[(String,String),Seq[Double]]) 359 | 360 | def dot(a1:Seq[Double],a2:Seq[Double]) = ??? 361 | 362 | val rels = Seq("profAt") 363 | val ents = Seq("Luke" -> "MIT") 364 | 365 | 366 | def searchSpace(k:Int) = all(Model)(maps(rels,fvectors(k)) x maps(ents,fvectors(k))) 367 | 368 | def fvectors(k:Int) = Wolfe.seqsOfLength(k,Wolfe.doubles) 369 | 370 | 371 | 372 | //@Potential(???) //cell logistic potential 373 | def logisticLoss(target:Double, arg1:Seq[Double], arg2:Seq[Double]) = 374 | //todo: sigmoid 375 | sum(0 until arg1.length) { i => arg1(i) * arg2(i) } 376 | 377 | //@Stochastic(String => (String, String)) //samples a non-observed pair efficiently from data; not for now 378 | //creates as many stochastic factors as the integer before the sum 379 | @Stochastic 380 | def negativeDataLoss(data: Seq[Data])(model: Model) = { 381 | val r = data.head.rel 382 | val numObserved = data.size //function of r 383 | val numUnobserved = ents.size - numObserved 384 | 385 | //there needs to be a default implementation that takes the filtered domain (ents) and samples from it 386 | numObserved * sum(ents filter { pair => !data.exists(d => pair == (d.arg1, d.arg2)) }){ pair => 387 | logisticLoss(0.0, model.entityPairVectors(pair), model.relationVectors(r)) * (numUnobserved / numObserved.toDouble) 388 | } 389 | } 390 | 391 | def objective(data:Seq[Data])(model:Model) = { 392 | sum(data) { d => logisticLoss(d.target,model.entityPairVectors(d.arg1 -> d.arg2), model.relationVectors(d.rel)) } + 393 | sum(rels) { r => negativeDataLoss(data.filter(_.rel == r))(model) } 394 | } 395 | 396 | println("It compiles, yay! 
:)")
397 | }
-------------------------------------------------------------------------------- /src/main/scala/uclmr/PimpMyFactorie.scala: --------------------------------------------------------------------------------
1 | package uclmr
2 |
3 | import cc.factorie.la._
4 | import scala.Array
5 | import cc.factorie.util.SparseDoubleSeq
6 | import scala.language.implicitConversions
7 |
8 | /**
9 | * @author rockt
10 | */
11 | class PimpMyFactorie {
12 |
13 | }
14 |
15 |
16 | object PimpMyFactorie {
17 | //FIXME: for some reason this method is not applied implicitly
18 | implicit def applyElementwise(fun: Double => Double): (Tensor => Tensor) = {
19 | (tensor: Tensor) =>
20 | for ((ix, value) <- tensor.activeElements) tensor.update(ix, fun(value))
21 | tensor
22 | }
23 |
24 | implicit class PimpedTensor(self: Tensor) {
25 | def toPrettyString: String = self match {
26 | //case sparse: SparseDoubleSeq => sparse.activeElements.map(t => t._1 + "\t" + t._2).mkString("\n")
27 | case tensor1: Tensor1 => tensor1.asArray.mkString("\n")
28 | case tensor2: Tensor2 => (0 until tensor2.dim1).map(row => (0 until tensor2.dim2).map(col => tensor2(row, col)).mkString(" ")).mkString("\n")
29 | case tensor3: Tensor3 =>
30 | (0 until tensor3.dim2).map(row => (0 until tensor3.dim1).map(layer =>
31 | (0 until tensor3.dim3).map(col => tensor3(layer, row, col)).mkString(" ")
32 | ).mkString(" | ")).mkString("\n")
33 | }
34 | def toDimensionsString: String = self match {
35 | case tensor1: Tensor1 => tensor1.dim1.toString
36 | case tensor2: Tensor2 => s"${tensor2.dim1}×${tensor2.dim2}"
37 | case tensor3: Tensor3 => s"${tensor3.dim1}×${tensor3.dim2}×${tensor3.dim3}"
38 | }
39 | def vectorization: Tensor1 = new DenseTensor1(self.asArray)
40 |
41 | /**
42 | * Two tensors are equal if they have the same dimensions and values
43 | */
44 | def ===(obj: scala.Any): Boolean = (obj, self) match {
45 | case (other: Tensor1, self: Tensor1) => {
46 | if (other.dim1 != self.dim1) false
47 | else {
48 | for (i <- 0 until self.dim1)
49 | if (self(i) != other(i)) return false
50 | true
51 | }
52 | }
53 | case (other: Tensor2, self: Tensor2) => {
54 | if (other.dim1 != self.dim1 ||
55 | other.dim2 != self.dim2) false
56 | else {
57 | for {
58 | i <- 0 until self.dim1
59 | j <- 0 until self.dim2
60 | }
61 | if (self(i,j) != other(i,j)) return false
62 | true
63 | }
64 | }
65 | case (other: Tensor3, self: Tensor3) => {
66 | if (other.dim1 != self.dim1 ||
67 | other.dim2 != self.dim2 ||
68 | other.dim3 != self.dim3) false
69 | else {
70 | for {
71 | i <- 0 until self.dim1
72 | j <- 0 until self.dim2
73 | k <- 0 until self.dim3
74 | }
75 | if (self(i,j,k) != other(i,j,k)) return false
76 | true
77 | }
78 | }
79 | case _ => self.equals(obj)
80 | }
81 | }
82 |
83 | implicit class PimpedTensor1(self: Tensor1) {
84 | def t: Tensor2 = new DenseTensor2(Array(self.asArray))
85 | def slice(from: Int, to: Int): Tensor1 = new DenseTensor1(self.asArray.slice(from, to))
86 | def *(tensor2: Tensor2): Tensor1 = tensor2.leftMultiply(self)
87 |
88 | def <>(tensor1: Tensor1): Tensor2 = self.outer(tensor1).asInstanceOf[Tensor2]
89 | }
90 |
91 | /**
92 | * Pimped tensor2 with dim1 = rows, dim2 = columns
93 | */
94 | implicit class PimpedTensor2(self: Tensor2) {
95 | /**
96 | * Returns the transpose of the matrix
97 | */
98 | def t: Tensor2 = {
99 | new DenseTensor2(self) {
100 | override protected def _initialArray: Array[Double] = self.asArray
101 | override val dim1 = self.dim2
102 | override val dim2 = self.dim1
103 | override
def apply(i: Int, j: Int): Double = self.apply(j, i)
104 | }
105 | }
106 |
107 | //TODO: make this more efficient
108 | def multiply(other: Tensor2): Tensor2 = {
109 | require(self.dim2 == other.dim1, s"${self.dim1}x${self.dim2} * ${other.dim1}x${other.dim2}")
110 | val tmp = new DenseTensor2(self.dim1, other.dim2)
111 | for {
112 | i <- 0 until self.dim1
113 | j <- 0 until other.dim2
114 | } tmp.update(i, j, (for (k <- 0 until self.dim2) yield self(i, k) * other(k, j)).sum)
115 | tmp
116 | }
117 |
118 | //rockt: inefficient?
119 | def reshape(rows: Int, columns: Int): Tensor2 = {
120 | require(rows * columns == self.dim1 * self.dim2)
121 | new DenseTensor2(self.asSeq.grouped(columns).toArray)
122 | }
123 |
124 | /**
125 | * Updates the ith column with tensor1
126 | */
127 | def update(i: Int, tensor1: Tensor1) = {
128 | require(self.dim1 == tensor1.dim1)
129 | for (j <- 0 until self.dim1) self.update(j, i, tensor1(j))
130 | }
131 |
132 | def mul(value: Double): Tensor2 = (self * value).asInstanceOf[Tensor2]
133 |
134 | def getRow(ix: Int): Tensor1 =
135 | new DenseTensor1((for (i <- 0 until self.dim2) yield self(ix, i)).toArray)
136 |
137 | //only works if tensor2 is a SparseBinaryTensor2
138 | def getSparseRow(ix: Int): SparseTensor1 = {
139 | val matrix = self.asInstanceOf[SparseBinaryTensor2]
140 | val v = new SparseTensor1(self.dim2)
141 |
142 | val minIx = ix * self.dim2
143 | val maxIx = (ix + 1) * self.dim2
144 |
145 | val elems = matrix.activeElements.filter(p => minIx <= p._1 && p._1 < maxIx)
146 |
147 | elems.foreach(p => {
148 | val (ix, value) = p
149 | v.update(ix % self.dim2, value)
150 | })
151 |
152 | v
153 | }
154 | }
155 |
156 | /**
157 | * Pimped tensor3 with dim1 = layers, dim2 = rows, dim3 = columns
158 | */
159 | implicit class PimpedTensor3(self: Tensor3) {
160 | /**
161 | * Multiplies the tensor with a vector in mode 1, i.e., inner product with every mode 1 (tube) fiber.
162 | * TODO: generalize this to mode 2 and mode 3
163 | * TODO: this method is the performance bottleneck: use DenseLayeredTensor3 and pick the vectors you need!
164 | * TODO: is there a parallel implementation for this?
165 | */
166 | def firstModeVectorProduct(tensor1: Tensor1): Tensor2 = tensor1 match {
167 | case t: SparseTensor =>
168 | //FIXME: this is currently not general, since it only works for calculating the 2*1 score
169 | require(self.dim2 == 2 && self.dim3 == 1)
170 | val result = new DenseTensor2(self.dim2, self.dim3)
171 | var sum0 = 0.0
172 | var sum1 = 0.0
173 |
174 | t.activeElements.foreach(elem => {
175 | val (ix, value) = elem
176 | sum0 += self(ix, 0, 0) * value
177 | sum1 += self(ix, 1, 0) * value
178 | })
179 |
180 | result.update(0, 0, sum0)
181 | result.update(1, 0, sum1)
182 | result
183 | case _ =>
184 | //println(tensor1.getClass)
185 | //require(self.dim1 == tensor1.dim1, s"${self.toDimensionsString} * ${tensor1.toDimensionsString}")
186 | require(self.dim1 == tensor1.dim1)
187 |
188 | val tensor2 = new DenseTensor2(self.dim2, self.dim3)
189 | var i = 0
190 | var j = 0
191 | var k = 0
192 | while(j tensor1
346 | case tensor2: Tensor2 if tensor2.dim1 == 1 || tensor2.dim2 == 1 => new DenseTensor1(tensor2.asArray)
347 | case _ => throw new scala.MatchError("I don't know how to transform this into a Tensor1: " + tensor)
348 | }
349 |
350 | implicit def tensor1ToTensor2(tensor1: Tensor1): Tensor2 = {
351 | val tensor2 = new DenseTensor2(tensor1.dim1, 1) {
352 | _setArray(tensor1.asArray)
353 | }
354 | tensor2
355 | }
356 |
357 | //TODO: speed this up!
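// Descriptive note: tensor3ToTensor2 below flattens a (dim1 x dim2 x 1) Tensor3 into a
// (dim2 x dim1) matrix by swapping the first two modes, i.e. matrix(j, i) == tensor3(i, j, 0);
// tensorToTensor3 further down performs the corresponding inverse for Tensor2 inputs.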
358 | def tensor3ToTensor2(tensor3: Tensor3): Tensor2 = { 359 | require(tensor3.dim3 == 1) 360 | val matrix = new DenseTensor2(tensor3.dim2, tensor3.dim1) 361 | for { 362 | i <- 0 until tensor3.dim1 363 | j <- 0 until tensor3.dim2 364 | } matrix update(j, i, tensor3(i, j, 0)) 365 | 366 | matrix 367 | } 368 | 369 | //TODO: speed this up! 370 | def tensorToTensor3(tensor: Tensor): Tensor3 = tensor match { 371 | case tensor2: Tensor2 => 372 | val tensor3 = new DenseTensor3(tensor2.dim2, tensor2.dim1, 1) 373 | for { 374 | i <- 0 until tensor2.dim1 375 | j <- 0 until tensor2.dim2 376 | } tensor3.update(j, i, 0, tensor2(i, j)) 377 | tensor3 378 | case _ => throw new scala.MatchError("I don't know how to transform this into a Tensor3: " + tensor) 379 | } 380 | 381 | def featureMatrixToTensor3(tensor: Tensor, featureDim: Int): Tensor3 = tensor match { 382 | case tensor2: Tensor2 => 383 | //val numActive = tensor2.activeElements.size 384 | val tensor3 = new SparseIndexedTensor3(featureDim + 1, tensor2.dim1, 1) 385 | /*{ 386 | super.ensureCapacity(numActive) 387 | override def ensureCapacity(cap: Int): Unit = true 388 | }*/ 389 | //println("outer tensor: " + tensor2.toDimensionsString) 390 | //println("new tensor: " + tensor3.toDimensionsString) 391 | //println("outer: " + tensor2.toPrettyString) 392 | for { 393 | i <- tensor2.activeDomain1 394 | j <- tensor2.activeDomain2 395 | } { 396 | //println(i,j) 397 | tensor3.update(j, i, 0, tensor2(i, j)) 398 | } 399 | tensor3 400 | } 401 | 402 | val TENSOR3_ONE = { 403 | val one = new DenseTensor3(1,1,1) 404 | one update (0, 1.0) 405 | one 406 | } 407 | } -------------------------------------------------------------------------------- /src/main/scala/uclmr/ProbLogicEmbeddings.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import cc.factorie.la.DenseTensor1 4 | import cc.factorie.model.WeightsSet 5 | import cc.factorie.optimize._ 6 | import com.typesafe.config.Config 7 | import ml.wolfe.Wolfe._ 8 | import ml.wolfe._ 9 | import uclmr.FactorizationUtil.Row 10 | import ml.wolfe.fg.VectorMsgs 11 | import ml.wolfe.util.Util 12 | 13 | import scala.collection.mutable 14 | import scala.util.Random 15 | 16 | /** 17 | * @author Sebastian Riedel 18 | */ 19 | case class PredicateEmbedding(rel: String, embedding: FactorieVector, 20 | scale: Double, bias: Double, weight: Double = 1.0, 21 | observationFilter: String => Boolean = _ => true) { 22 | def distance(that: PredicateEmbedding) = { 23 | Util.sq(embedding.l2Similarity(that.embedding)) 24 | } 25 | } 26 | 27 | case class ProbLogicEmbeddings(embeddings: Map[String, PredicateEmbedding], 28 | rules: Rules = Rules(Map.empty, Map.empty), 29 | average: Boolean = true, forceZero: Boolean = true, 30 | usel2dist: Boolean = false, minWhenUsingL2Dist: Boolean = true) { 31 | 32 | 33 | def predict(observations: Seq[String], relation: String) = { 34 | embeddings.get(relation) match { 35 | case None => 0.0 36 | case Some(embedding) => 37 | val filteredObs = observations.filter(embedding.observationFilter) 38 | if (!forceZero || filteredObs.size > 0) { 39 | val normalizer = if (average) filteredObs.size.toDouble else 1.0 40 | var score = embedding.bias 41 | if (!usel2dist) for (obs <- filteredObs; obsEmb <- embeddings.get(obs)) { 42 | score += obsEmb.weight * embedding.scale * (embedding.embedding dot obsEmb.embedding) / normalizer // observations.size 43 | } else { 44 | //take the average 45 | if (minWhenUsingL2Dist) { 46 | val distances = for (obs <- 
filteredObs.view; obsEmb <- embeddings.get(obs).view) yield { 47 | Util.sq(obsEmb.embedding.l2Similarity(embedding.embedding)) 48 | } 49 | score -= distances.min 50 | } else { 51 | val result = new DenseTensor1(embedding.embedding) 52 | for (obs <- filteredObs; obsEmb <- embeddings.get(obs)) { 53 | result +=(obsEmb.embedding, -1.0 / normalizer) // observations.size 54 | } 55 | score -= result.twoNormSquared 56 | } 57 | } 58 | val result = Util.sig(score) 59 | result 60 | } else 0.0 61 | } 62 | } 63 | def predictRow(observation: Row, targets: Seq[String]) = { 64 | observation.copy(relations = targets.map(r => r -> predict(observation.observedTrue, r))) 65 | } 66 | 67 | def pairwiseRules(relPairs: Iterable[(String, String)]) = { 68 | val relations = embeddings.keys.toArray.sorted 69 | val marginals = (for (r <- relations) yield r -> predict(Seq.empty, r)).toMap 70 | val result = for ((rel1, rel2) <- relPairs; 71 | emb1 = embeddings(rel1); 72 | emb2 = embeddings(rel2)) yield { 73 | val prob1given2 = predict(Seq(rel2), rel1) 74 | val prob2given1 = predict(Seq(rel1), rel2) 75 | val prob1 = marginals(rel1) 76 | val prob2 = marginals(rel2) 77 | val probs = Map( 78 | (true, true) -> prob1given2 * prob2, //todo: this may be different to using the other way around 79 | (true, false) -> (1.0 - prob2given1) * prob1, 80 | (false, true) -> (1.0 - prob1given2) * prob2, 81 | (false, false) -> (1 - prob1) * (1 - prob2) 82 | ) 83 | (rel1, rel2) -> Rule2( 84 | rel1, rel2, probs, 85 | trueTrueInconsistency = math.abs(prob1given2 * prob2 - prob2given1 * prob1), 86 | cond1given2 = prob1given2, cond2given1 = prob2given1) 87 | } 88 | result.toMap 89 | } 90 | 91 | } 92 | 93 | case class Rules(rules2: Map[(String, String), Rule2], rules1: Map[String, Rule1] = Map.empty) { 94 | lazy val rel2RuleArg1 = rules2.toSeq.groupBy(_._1._1) withDefaultValue Seq.empty 95 | lazy val rel2RuleArg2 = rules2.toSeq.groupBy(_._1._2) withDefaultValue Seq.empty 96 | 97 | lazy val relations = rules2.keySet.map(_._1) ++ rules2.keySet.map(_._2) 98 | 99 | def pairwiseRuleCount(rel: String) = rel2RuleArg1(rel).size + rel2RuleArg2(rel).size 100 | 101 | def +(that: Rules): Rules = { 102 | val result = new mutable.HashMap[(String, String), Rule2] 103 | for ((pair, r1) <- rules2) { 104 | that.rules2.get(pair) match { 105 | case Some(r2) => result(pair) = r1 + r2 106 | case None => result(pair) = r1 107 | } 108 | } 109 | for ((pair, r2) <- that.rules2) if (!result.contains(pair)) result(pair) = r2 110 | copy(rules2 = result.toMap) 111 | 112 | } 113 | 114 | def withPriorCounts(priorCounts: Map[(Boolean, Boolean), Double]) = { 115 | val normalizer = priorCounts.values.sum 116 | val probs = priorCounts.mapValues(_ / normalizer) 117 | val rule = Rule2("r1", "r2", probs, count = normalizer, cond1given2 = 0, cond2given1 = 0) 118 | copy(rules2 = rules2.mapValues(_ + rule)) 119 | } 120 | 121 | } 122 | 123 | 124 | case class Rule2(rel1: String, rel2: String, probs: Map[(Boolean, Boolean), Double], scale: Double = 1, 125 | count: Double = 1.0, trueTrueInconsistency: Double = 0.0, cond1given2: Double, cond2given1: Double) { 126 | 127 | assert(cond1given2 <= 1.0) 128 | def marg1(b1: Boolean) = probs(b1, true) + probs(b1, false) 129 | def marg2(b2: Boolean) = probs(true, b2) + probs(false, b2) 130 | def prob2given1(b1: Boolean)(b2: Boolean) = probs(b1, b2) / marg1(b1) 131 | def prob1given2(b2: Boolean)(b1: Boolean) = probs(b1, b2) / marg2(b2) 132 | 133 | def cooccurCount = count * probs(true, true) 134 | override def toString = 135 | s"""$rel1 $rel2 ${ if 
(trueTrueInconsistency > 0.0) "(" + trueTrueInconsistency.toString + ")" else "" } 136 | |p(r1|r2) = ${ cond1given2 } 137 | |p(r2|r1) = ${ cond2given1 } 138 | |p(r1) = ${ marg1(true) } 139 | |p(r2) = ${ marg2(true) } 140 | """.stripMargin 141 | 142 | lazy val mutualInformation = { 143 | probs(true, true) * math.log(probs(true, true) / (marg1(true) * marg2(true))) + 144 | probs(true, false) * math.log(probs(true, false) / (marg1(true) * marg2(false))) + 145 | probs(false, true) * math.log(probs(false, true) / (marg1(false) * marg2(true))) + 146 | probs(false, false) * math.log(probs(false, false) / (marg1(false) * marg2(false))) 147 | } 148 | 149 | def +(that: Rule2) = { 150 | val newCount = count + that.count 151 | def newProb(b1: Boolean, b2: Boolean) = (probs(b1, b2) * count + that.probs(b1, b2) * that.count) / newCount 152 | val newProbs = Map( 153 | (true, true) -> newProb(true, true), 154 | (true, false) -> newProb(true, false), 155 | (false, true) -> newProb(false, true), 156 | (false, false) -> newProb(false, false) 157 | ) 158 | copy(probs = newProbs, count = newCount) 159 | } 160 | 161 | 162 | def klTerm(p1: Double, p2: Double) = if (p1 == 0.0) 0.0 else p1 * math.log(p1 / p2) 163 | 164 | def prob1given2Inc(that: Rule2) = cond1given2 - that.cond1given2 165 | 166 | def condKL(that: Rule2) = { 167 | klTerm(prob1given2(true)(true), that.prob1given2(true)(true)) + 168 | klTerm(prob1given2(true)(false), that.prob1given2(true)(false)) + 169 | klTerm(prob2given1(true)(true), that.prob2given1(true)(true)) + 170 | klTerm(prob2given1(true)(false), that.prob2given1(true)(false)) 171 | } 172 | 173 | def kl(that: Rule2) = { 174 | klTerm(probs(true, true), that.probs(true, true)) + 175 | klTerm(probs(true, false), that.probs(true, false)) + 176 | klTerm(probs(false, true), that.probs(false, true)) + 177 | klTerm(probs(false, false), that.probs(false, false)) 178 | } 179 | 180 | } 181 | case class Rule1(rel: String, prob: Double) 182 | 183 | object RuleInjector { 184 | def injectImplication(rule: Rule2, forward: Boolean = true): Rule2 = { 185 | forward match { 186 | case true => 187 | val probs = Map((true, true) -> (rule.probs(true, true) + rule.probs(true, false)), (true, false) -> 0.0) 188 | rule.copy(probs = rule.probs ++ probs) 189 | case false => 190 | val probs = Map((true, true) -> (rule.probs(true, true) + rule.probs(false, true)), (false, true) -> 0.0) 191 | rule.copy(probs = rule.probs ++ probs) 192 | } 193 | } 194 | } 195 | 196 | object ProbLogicEmbedder { 197 | 198 | def embed(rules: Rules)(implicit conf: Config): ProbLogicEmbeddings = { 199 | 200 | import ml.wolfe.FactorGraph.Node 201 | 202 | val random = new Random(0) 203 | val relations = rules.rules2.values.flatMap(r => Seq(r.rel1, r.rel2)).distinct.sorted.toArray 204 | val numRelations = relations.size 205 | val fg = new FactorGraph 206 | val k = conf.getInt("epl.relation-dim") 207 | val regW = conf.getDouble("epl.reg-embed") 208 | val regS = conf.getDouble("epl.reg-scale") 209 | val regBias = conf.getDouble("epl.reg-bias") 210 | val regMult = conf.getDouble("epl.reg-mult") 211 | val doNormB = conf.getBoolean("epl.norm-b") 212 | val scalePrior = conf.getDouble("epl.scale-prior") 213 | val biasPrior = conf.getDouble("epl.bias-prior") 214 | val multPrior = conf.getDouble("epl.mult-prior") 215 | val weighTerms = conf.getBoolean("epl.weigh-terms") 216 | val unitBall = conf.getBoolean("epl.unit-ball") 217 | val l2dist = conf.getBoolean("epl.l2-dist") 218 | 219 | val maxMarg = rules.rules2.view.flatMap(t => Iterator(t._2.marg1(true),
t._2.marg2(true))).max 220 | 221 | val V = relations.map(r => r -> fg.addVectorNode(k, r)).toMap 222 | if (unitBall) for (n <- V.values) n.variable.asVector.unitVector = true 223 | def emptyMap = Map.empty[String, Node] withDefaultValue null 224 | val colScales = if (regS == Double.PositiveInfinity) emptyMap else relations.map(i => i -> fg.addVectorNode(1)).toMap 225 | val colBiases = if (regBias == Double.PositiveInfinity) emptyMap else relations.map(i => i -> fg.addVectorNode(1)).toMap 226 | val colMults = if (regMult == Double.PositiveInfinity) emptyMap else relations.map(i => i -> fg.addVectorNode(1)).toMap 227 | 228 | //initialize 229 | for (n <- V.values; i <- 0 until k) n.variable.asVector.b(i) = random.nextGaussian() * 1.0 230 | for (n <- colScales.values) n.variable.asVector.b(0) = scalePrior 231 | for (n <- colBiases.values) n.variable.asVector.b(0) = biasPrior 232 | for (n <- colMults.values) n.variable.asVector.b(0) = multPrior 233 | 234 | val numberOfTerms = numRelations * (numRelations - 1) / 2.0 235 | val objNormalizer = 1.0 / numberOfTerms 236 | 237 | println("Building factor graph") 238 | 239 | var numJointFactors = 0 240 | for (rel1Index <- 0 until relations.length; rel2Index <- rel1Index + 1 until relations.size) { 241 | val rel1 = relations(rel1Index) 242 | val rel2 = relations(rel2Index) 243 | 244 | val v1 = V(rel1) 245 | val v2 = V(rel2) 246 | 247 | val s1 = colScales(rel1) 248 | val s2 = colScales(rel2) 249 | 250 | val eta1 = colBiases(rel1) 251 | val eta2 = colBiases(rel2) 252 | 253 | val m1 = colMults(rel1) 254 | val m2 = colMults(rel2) 255 | 256 | val relNormalizer1 = rules.pairwiseRuleCount(rel1) 257 | val relNormalizer2 = rules.pairwiseRuleCount(rel2) 258 | 259 | rules.rules2.get((rel1, rel2)) match { 260 | case Some(rule) => 261 | if (!l2dist) fg.buildFactor(Seq(v1, eta1, s1, m1, v2, eta2, s2, m2))(_ map (_ => new VectorMsgs)) { 262 | e => new JointPotential( 263 | e(0), e(1), e(2), e(3), 264 | e(4), e(5), e(6), e(7), 265 | rule.prob1given2(true)(true), rule.prob2given1(true)(true), 266 | rule.marg1(true), rule.marg2(true), 267 | regW, regBias, regS, regMult, 268 | biasPrior, scalePrior, multPrior, 269 | 1.0 / relNormalizer1, 1.0 / relNormalizer2, 270 | if (weighTerms) rule.marg1(true) / maxMarg else 1.0, 271 | if (weighTerms) rule.marg2(true) / maxMarg else 1.0) 272 | } else 273 | fg.buildFactor(Seq(v1, v2, eta1, eta2))(_ map (_ => new VectorMsgs)) { 274 | e => new L2DistanceBasedPotential( 275 | e(0), e(1), e(2), e(3), 276 | rule.prob1given2(true)(true), rule.prob2given1(true)(true), 277 | 1.0, 278 | regW, regBias, biasPrior, 279 | 1.0 / relNormalizer1, 1.0 / relNormalizer2) 280 | } 281 | //if (numJointFactors == 0) PotentialDebugger.checkGradients(factor.potential, debug = true) 282 | numJointFactors += 1 283 | 284 | case _ => 285 | } 286 | } 287 | 288 | fg.build() 289 | println(s"Optimizing... 
with ${ fg.factors.size } terms") 290 | 291 | val maxIterations = conf.getInt("epl.opt-iterations") 292 | 293 | 294 | // val step = new AdaGrad(conf.getDouble("epl.ada-rate")) with UnitBallProjection 295 | val step = new AdaMira(conf.getDouble("epl.ada-rate")) with UnitBallProjection 296 | //val step = new LBFGS() with UnitBallProjection 297 | 298 | 299 | def trainer(weightsSet: WeightsSet) = conf.getString("epl.trainer") match { 300 | case "batch" => new BatchTrainer(weightsSet, step, maxIterations) 301 | case "online" => new OnlineTrainer(weightsSet, step, maxIterations) 302 | } 303 | 304 | GradientBasedOptimizer(fg, trainer(_), step) 305 | //GradientBasedOptimizer(fg, new BatchTrainer(_, new LBFGS(), maxIterations)) 306 | 307 | //allowed observations for each predicate are only the relations we have seen together with the predicate 308 | val allowed = new mutable.HashMap[String, mutable.HashSet[String]]() 309 | for ((r1, r2) <- rules.rules2.keys) { 310 | allowed.getOrElseUpdate(r1, new mutable.HashSet[String]) += r2 311 | allowed.getOrElseUpdate(r2, new mutable.HashSet[String]) += r1 312 | } 313 | //val allPairs = rules.rules2.keySet.flatMap(p => Set(p, p.swap)) 314 | //val allowed = allPairs.groupBy(_._1).mapValues(_.map(_._2)) 315 | val embeddings = relations.map({ rel => 316 | rel -> PredicateEmbedding(rel, 317 | V(rel).variable.asVector.b, 318 | if (regS == Double.PositiveInfinity) scalePrior else colScales(rel).variable.asVector.b(0), 319 | if (regBias == Double.PositiveInfinity) biasPrior else colBiases(rel).variable.asVector.b(0), 320 | if (regMult == Double.PositiveInfinity) multPrior else colMults(rel).variable.asVector.b(0), 321 | allowed(rel)) 322 | }) 323 | ProbLogicEmbeddings(embeddings.toMap, rules, usel2dist = l2dist) 324 | } 325 | 326 | } 327 | 328 | 329 | object RuleLearner { 330 | def learn(rows: Seq[Row], priorCounts: Map[(Boolean, Boolean), Double] = Map.empty withDefaultValue 0.0): Rules = { 331 | val pairCounts = mutable.HashMap[(String, String), Int]() withDefaultValue 0 332 | val singleCounts = mutable.HashMap[String, Int]() withDefaultValue 0 333 | 334 | for (row <- rows) { 335 | val cells = row.relations 336 | for (cell <- cells) singleCounts(cell._1) += 1 337 | for (i <- 0 until cells.size; j <- i + 1 until cells.size) { 338 | //todo: more sensible to sort relations here instead of adding two versions.
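// both orderings of the pair are counted so that later lookups such as
// pairCounts((rel1, rel2)) succeed regardless of argument order (cf. the todo above)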
339 | pairCounts(cells(i)._1 -> cells(j)._1) += 1 340 | pairCounts(cells(j)._1 -> cells(i)._1) += 1 341 | } 342 | } 343 | 344 | val relations = singleCounts.keys.toArray.sorted 345 | val normalizer = rows.size.toDouble + priorCounts.values.sum 346 | val rules2 = for (r1 <- 0 until relations.size; r2 <- r1 + 1 until relations.size) yield { 347 | val rel1 = relations(r1) 348 | val rel2 = relations(r2) 349 | val pairCount = pairCounts((rel1, rel2)) 350 | val singleCount1 = singleCounts(rel1) 351 | val singleCount2 = singleCounts(rel2) 352 | val prob11 = (pairCount + priorCounts(true, true)) / normalizer 353 | val prob10 = ((singleCount1 - pairCounts(rel1, rel2)) + priorCounts(true, false)) / normalizer 354 | val prob01 = ((singleCount2 - pairCounts(rel1, rel2)) + priorCounts(false, true)) / normalizer 355 | val prob00 = 1.0 - prob11 - prob10 - prob01 356 | val probs = Map( 357 | (true, true) -> prob11, (true, false) -> prob10, 358 | (false, true) -> prob01, (false, false) -> prob00 359 | ) 360 | (rel1, rel2) -> Rule2(rel1, rel2, probs, 1.0, count = normalizer, 361 | cond1given2 = prob11 / (prob01 + prob11), 362 | cond2given1 = prob11 / (prob10 + prob11)) 363 | } 364 | val rules1 = for ((r, c) <- singleCounts) yield r -> Rule1(r, c / normalizer) 365 | Rules(rules2.toMap, rules1.toMap) 366 | } 367 | } 368 | 369 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/SoftLogicPotentials.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import ml.wolfe.FactorGraph.Edge 4 | import ml.wolfe._ 5 | import ml.wolfe.fg.{Regularization, Potential} 6 | import ml.wolfe.util.Conf 7 | 8 | /** 9 | * A potential for a formula containing two predicates 10 | * @param constEdge edge to variable that refers to a constant 11 | * @param predicate1Edge edge to first predicate 12 | * @param predicate2Edge edge to second predicate 13 | * @param target target 14 | * @param lambda regularization parameter 15 | * @author rockt 16 | */ 17 | abstract class Formula2Potential(constEdge: Edge, predicate1Edge: Edge, predicate2Edge: Edge, target: Double = 1.0, 18 | val lambda: Double = 0.0, weight: Double = 1.0) extends Potential with Regularization { 19 | def cVar = constEdge.n.variable.asVector 20 | def p1Var = predicate1Edge.n.variable.asVector 21 | def p2Var = predicate2Edge.n.variable.asVector 22 | val cMsgs = constEdge.msgs.asVector 23 | val p1Msgs = predicate1Edge.msgs.asVector 24 | val p2Msgs = predicate2Edge.msgs.asVector 25 | 26 | def sig(x: Double) = 1.0 / (1.0 + math.exp(-x)) 27 | 28 | private def innerLossAndDirection(s: Double): (Double, Int) = 29 | if (target >= s) (1 + s - target, 1) 30 | else (1 + target - s, -1) 31 | 32 | override def valueForCurrentSetting(): Double = { 33 | val c = cVar.setting 34 | val p1 = p1Var.setting 35 | val p2 = p2Var.setting 36 | val p1c = sig(c dot p1) 37 | val p2c = sig(c dot p2) 38 | 39 | val s = F(p1c, p2c) 40 | 41 | val loss = innerLossAndDirection(s)._1 42 | math.log(loss) * weight + regLoss(c) + regLoss(p1) + regLoss(p2) 43 | } 44 | 45 | override def valueAndGradientForAllEdges(): Double = { 46 | val p1c = sig(cMsgs.n2f dot p1Msgs.n2f) 47 | val p2c = sig(cMsgs.n2f dot p2Msgs.n2f) 48 | 49 | val s = F(p1c, p2c) 50 | 51 | val (loss, dir) = innerLossAndDirection(s) 52 | 53 | val p1c_p1 = cMsgs.n2f * p1c * (1 - p1c) 54 | val p1c_c = p1Msgs.n2f * p1c * (1 - p1c) 55 | val p2c_p2 = cMsgs.n2f * p2c * (1 - p2c) 56 | val p2c_c = p2Msgs.n2f * p2c * (1 - p2c) 57 | 58 | 
p1Msgs.f2n = (calcF_p1(p1c_p1, p2c) * (1.0 / loss) * dir) * weight + regGradient(p1Msgs.n2f) 59 | p2Msgs.f2n = (calcF_p2(p2c_p2, p1c) * (1.0 / loss) * dir) * weight + regGradient(p2Msgs.n2f) 60 | cMsgs.f2n = 61 | if (Conf.getBoolean("mf.inject-rows")) 62 | (calcF_c(p2c_c, p1c, p1c_c, p2c) * (1.0 / loss) * dir) * weight + regGradient(cMsgs.n2f) 63 | else 64 | new SparseVector(cMsgs.n2f.length) 65 | 66 | 67 | math.log(loss) * weight + regLoss(cMsgs.n2f) + regLoss(p1Msgs.n2f) + regLoss(p2Msgs.n2f) 68 | } 69 | 70 | /** 71 | * Calculates the score of a formula F that contains two predicates p1, p2. 72 | * @param p1c score of [p1(c)] 73 | * @param p2c score of [p2(c)] 74 | * @return score of [F] 75 | */ 76 | def F(p1c: Double, p2c: Double): Double 77 | /** 78 | * Calculates gradient of [p1] in formula F. 79 | * @param p1c_p1 gradient of [p1] in [p1(c)] 80 | * @param p2c score of [p2(c)] 81 | * @return gradient of [p1] 82 | */ 83 | def calcF_p1(p1c_p1: FactorieVector, p2c: Double): FactorieVector 84 | /** 85 | * Calculates gradient of [p2] in formula F. 86 | * @param p2c_p2 gradient of [p2] in [p2(c)] 87 | * @param p1c score of [p1(c)] 88 | * @return gradient of [p2] 89 | */ 90 | def calcF_p2(p2c_p2: FactorieVector, p1c: Double): FactorieVector 91 | /** 92 | * Calculates gradient of [c] in formula F. 93 | * @param p2c_c gradient of [c] in [p2(c)] 94 | * @param p1c score of [p1(c)] 95 | * @param p1c_c gradient of [c] in [p1(c)] 96 | * @param p2c score of [p2(c)] 97 | * @return gradient of [c] 98 | */ 99 | def calcF_c(p2c_c: FactorieVector, p1c: Double, p1c_c: FactorieVector, p2c: Double): FactorieVector 100 | } 101 | 102 | 103 | class ImplPotential(constEdge: Edge, pred1Edge: Edge, pred2Edge: Edge, target: Double = 1.0, override val lambda: Double = 0.0, weight: Double = 1.0) 104 | extends Formula2Potential(constEdge, pred1Edge, pred2Edge, target, lambda, weight) { 105 | //[p₁(c) => p₂(c)] := [p₁(c)]*([p₂(c)] - 1) + 1 106 | def F(p1c: Double, p2c: Double) = p1c * (p2c - 1) + 1 107 | def calcF_p1(p1c_p1: FactorieVector, p2c: Double) = p1c_p1 * (p2c - 1) 108 | def calcF_p2(p2c_p2: FactorieVector, p1c: Double) = p2c_p2 * p1c 109 | def calcF_c(p2c_c: FactorieVector, p1c: Double, p1c_c: FactorieVector, p2c: Double) = 110 | p2c_c * p1c + p1c_c * (p2c - 1) 111 | } 112 | 113 | 114 | class ImplNegPotential(constEdge: Edge, pred1Edge: Edge, pred2Edge: Edge, target: Double = 1.0, override val lambda: Double = 0.0, weight: Double = 1.0) 115 | extends Formula2Potential(constEdge, pred1Edge, pred2Edge, target, lambda, weight) { 116 | //[p₁(c) => ¬p₂(c)] := [p₁(c)]*(-[p₂(c)]) + 1 117 | def F(p1c: Double, p2c: Double) = p1c * -p2c + 1 118 | def calcF_p1(p1c_p1: FactorieVector, p2c: Double) = p1c_p1 * -p2c 119 | def calcF_p2(p2c_p2: FactorieVector, p1c: Double) = p2c_p2 * -p1c 120 | def calcF_c(p2c_c: FactorieVector, p1c: Double, p1c_c: FactorieVector, p2c: Double) = 121 | p2c_c * -p1c + p1c_c * -p2c 122 | } -------------------------------------------------------------------------------- /src/main/scala/uclmr/future/MatrixFactorization2.scala: -------------------------------------------------------------------------------- 1 | package uclmr.future 2 | 3 | import cc.factorie.la.DenseTensor1 4 | import cc.factorie.optimize.{AdaGrad, BatchTrainer} 5 | import ml.wolfe.fg20._ 6 | import ml.wolfe.util.Math._ 7 | 8 | import scala.util.Random 9 | 10 | /** 11 | * @author rockt 12 | */ 13 | object MatrixFactorization2 extends App { 14 | val k = 5 15 | val cols = Array("r1", "r2", "r3", "r4", "r5").map(new VectVar(k, _)) 16 
| val rows = Array("e1", "e2", "e3", "e4", "e5").map(new VectVar(k, _)) 17 | 18 | val initialState = new MapBasedState( 19 | (cols ++ rows).map(_ -> new DenseTensor1((0 until k).map(i => random.nextGaussian() * 0.1).toArray)).toMap 20 | ) 21 | 22 | val data = Array( 23 | Array(1, 0, 1, 0, 1), 24 | Array(1, 1, 0, 0, 1), 25 | Array(0, 0, 0, 1, 0), 26 | Array(1, 0, 1, 0, 0), 27 | Array(1, 0, 0, 0, 1) 28 | ) 29 | 30 | val potentials = 31 | (0 until rows.length).flatMap(r => { 32 | (0 until cols.length).collect { 33 | case c if data(r)(c) == 1 => 34 | new FlatSum[Differentiable](Seq( 35 | new MFLogisticPotential(rows(r), cols(c)), 36 | new L2Regularization(0.01, rows(r), cols(c)) 37 | )) with DifferentiableSum 38 | } 39 | }) 40 | 41 | val stochasticPotentials = 42 | (0 until rows.length).flatMap(r => { 43 | (0 until cols.length).collect { 44 | case c if data(r)(c) == 1 => 45 | def sampledRow = rows(random.nextInt(rows.length)) 46 | 47 | new FlatSum[Differentiable](Seq( 48 | new MFLogisticPotential(sampledRow, cols(c), 0.0), 49 | new L2Regularization(0.01, sampledRow, cols(c)) 50 | )) with DifferentiableSum 51 | 52 | 53 | //new MFLogisticPotential(sampledRow, cols(c), 0.0) 54 | } 55 | }) 56 | 57 | val problem = Problem(potentials ++ stochasticPotentials) 58 | 59 | val optimizer = new GradientBasedOptimizer(problem) 60 | 61 | val result = optimizer.gradientBasedArgmax(new BatchTrainer(_, new AdaGrad(0.1), 100), init = initialState) 62 | 63 | 64 | print("\t") 65 | cols.foreach(c => print(c + "\t")) 66 | println() 67 | rows.foreach(r => { 68 | print(r.name + "\t") 69 | cols.foreach(c => print(sigmoid(result.state(r) dot result.state(c)) + "\t")) 70 | println() 71 | }) 72 | 73 | } 74 | 75 | 76 | 77 | class MFLogisticPotential(rowVar: => VectVar, colVar: => VectVar, target: Double = 1.0) 78 | extends StatelessDifferentiable with StatelessScorer with VectPotential { 79 | 80 | override def vectVars: Array[VectVar] = Array(rowVar, colVar) 81 | 82 | private def innerLossAndDirection(s: Double): (Double, Int) = 83 | if (target >= s) (1 + s - target, 1) 84 | else (1 + target - s, -1) 85 | 86 | 87 | override def score(setting: Setting): Double = { 88 | val row = setting.vect(0) 89 | val col = setting.vect(1) 90 | 91 | val score = sigmoid(row dot col) 92 | 93 | val (loss, dir) = innerLossAndDirection(score) 94 | 95 | math.log(loss) 96 | } 97 | 98 | override def gradientAndValue(currentParameters: PartialSetting, gradient: Setting): Double = { 99 | val row = currentParameters.vect(0) 100 | val col = currentParameters.vect(1) 101 | 102 | val score = sigmoid(row dot col) 103 | 104 | val (loss, dir) = innerLossAndDirection(score) 105 | 106 | gradient.vect(0) = col * (1.0 - loss) * dir 107 | gradient.vect(1) = row * (1.0 - loss) * dir 108 | 109 | math.log(loss) 110 | } 111 | } 112 | 113 | /** 114 | * λ * Σ_i ||v_i||² 115 | */ 116 | class L2Regularization(lambda: Double, vars: VectVar*) extends StatelessDifferentiable with StatelessScorer with VectPotential { 117 | override def vectVars: Array[VectVar] = vars.toArray 118 | 119 | override def score(setting: Setting): Double = 120 | if (lambda == 0) 0 121 | else -lambda * setting.vect.map(v => v.twoNormSquared).sum 122 | 123 | override def gradientAndValue(currentParameters: PartialSetting, gradient: Setting): Double = { 124 | if (lambda != 0) 125 | (0 until vectVars.length).foreach(i => gradient.vect(i) = currentParameters.vect(i) * lambda * -2) 126 | 127 | score(currentParameters) 128 | } 129 | } 130 | 
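// A minimal self-contained sketch (added for illustration; `MFLogisticLossSketch` is not part
// of the original source): it reproduces the pointwise objective that MFLogisticPotential
// optimizes above. For target t and score s = σ(row ⋅ col), the loss is log(1 + s - t) when
// t >= s and log(1 + t - s) otherwise, so it is maximal (zero) exactly when s equals t.
object MFLogisticLossSketch extends App {
  def sigmoid(x: Double): Double = 1.0 / (1.0 + math.exp(-x))

  // the same inner loss as MFLogisticPotential.innerLossAndDirection, followed by math.log
  def logLoss(s: Double, target: Double): Double =
    if (target >= s) math.log(1 + s - target) else math.log(1 + target - s)

  val s = sigmoid(2.0)                // ≈ 0.881 for a cell predicted true
  println(logLoss(s, 1.0))            // ≈ -0.127, approaches 0.0 as s -> target
  println(logLoss(sigmoid(0.0), 1.0)) // = log(0.5) ≈ -0.693 for an uninformative score
}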
-------------------------------------------------------------------------------- /src/main/scala/uclmr/hack/MTShowcase.scala: -------------------------------------------------------------------------------- 1 | package uclmr.hack 2 | 3 | import java.io.FileWriter 4 | 5 | import uclmr.util.FormulaeExtractor._ 6 | import uclmr.{Formula, Impl, ImplNeg, TensorKB} 7 | import ml.wolfe.util.ProgressBar 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | /** 16 | * Finds the strongest connections between Portuguese shallow textual patterns and English dependency path patterns. 17 | * @author rockt 18 | */ 19 | object MTShowcase extends App { 20 | type Rule = Formula 21 | type Entity = Any 22 | type Relation = Any 23 | type SPDB = TensorKB 24 | 25 | def formulaScore(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true)(implicit db: SPDB): (Double, Int) = { 26 | val rows = pairs.filter(argFilter).map(_.head) 27 | (rows.map(e => rule(e)).sum / rows.size, rows.size) 28 | } 29 | 30 | /** 31 | * Calculates the weight of the formula based on matrix factorization predictions on observed premises. 32 | */ 33 | def formulaScoreMF(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true)(implicit db: SPDB): (Double, Int) = { 34 | val p1 = rule.predicates(0) 35 | val rows = db.getBy1(p1).map { case (ei, ej) => List(ei) } 36 | 37 | //we only care about the score over true observed premises 38 | val filteredRows = rows.filter(argFilter).map(_.head).filter(e => { 39 | val cell = db.get(p1,e).get 40 | cell.train && cell.target == 1.0 41 | }) 42 | 43 | (filteredRows.map(e => rule(e)).sum / filteredRows.size, filteredRows.size) 44 | } 45 | 46 | /** 47 | * Calculates the weight of the formula based on matrix factorization predictions. 48 | */ 49 | def formulaScoreMFPredicted(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true, threshold: Double = 0.1, onlyUnobserved: Boolean = true)(implicit db: SPDB): (Double, Int) = { 50 | val p1 = rule.predicates(0) 51 | val rows = db.getBy1(p1).map { case (ei, ej) => List(ei) } 52 | 53 | //we only care about the score over true predicted premises 54 | val filteredRows = rows.filter(argFilter).map(_.head).filter(e => db.prob(p1,e) >= threshold && (!onlyUnobserved || !db.get(p1, e).get.train)) 55 | 56 | if (filteredRows.isEmpty) (1.0, 0) else (filteredRows.map(e => rule(e)).sum / filteredRows.size, filteredRows.size) 57 | } 58 | 59 | /** 60 | * Calculates the weight of the formula based on the training data.
61 | */ 62 | def formulaScoreData(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true)(implicit db: SPDB): (Double, Int) = { 63 | val p1 = rule.predicates(0) 64 | val rows = db.getBy1(p1).map { case (ei, ej) => List(ei) } 65 | 66 | //we only care about the score over true observed premises 67 | val filteredRows = rows.filter(argFilter).map(_.head).filter(e => { 68 | val cell = db.get(p1,e).get 69 | cell.train && cell.target == 1.0 70 | }) 71 | 72 | (filteredRows.map(e => rule match { 73 | case Impl(_, p2, _) => if (db.get(p1, e).get.target == 1.0 && db.get(p2, e).map(_.target).getOrElse(0.0) == 0.0) 0.0 else 1.0 74 | case ImplNeg(_, p2, _) => if (db.get(p1, e).get.target == 1.0 && db.get(p2, e).map(_.target).getOrElse(0.0) == 1.0) 0.0 else 1.0 75 | }).sum / filteredRows.size, filteredRows.size) 76 | } 77 | 78 | def implScore(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 79 | formulaScore(Impl(r1,r2), pairs) 80 | 81 | def implNegScore(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 82 | formulaScore(ImplNeg(r1,r2), pairs) 83 | 84 | def implScoreMF(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 85 | formulaScoreMF(Impl(r1,r2), pairs) 86 | 87 | def implNegScoreMF(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 88 | formulaScoreMF(ImplNeg(r1,r2), pairs) 89 | 90 | def implScoreMFPredicted(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 91 | formulaScoreMFPredicted(Impl(r1,r2), pairs) 92 | 93 | def implNegScoreMFPredicted(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 94 | formulaScoreMFPredicted(ImplNeg(r1,r2), pairs) 95 | 96 | def implScoreData(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 97 | formulaScoreData(Impl(r1,r2), pairs) 98 | 99 | def implNegScoreData(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 100 | formulaScoreData(ImplNeg(r1,r2), pairs) 101 | 102 | 103 | 104 | 105 | 106 | implicit val db = new SPDB 107 | 108 | println("Loading db...") 109 | db.deserialize(args.lift(0).getOrElse("wolfe-apps/data/out/bbc/serialized/")) 110 | println(db.toInfoString) 111 | 112 | val premises = db.relations.filter(_.toString.startsWith("por")) 113 | //val consequents = db.relations.filter(consequentFilter) 114 | val consequents = db.relations.filter(_.toString.startsWith("eng")) 115 | 116 | val rows = db.trainCells.map(_.key2).distinct.map(List(_)) 117 | //.map { case (ei, ej) => List(ei, ej) } 118 | 119 | println("Generating formulae...") 120 | val progressBar = new ProgressBar(consequents.size * premises.size, 1000) 121 | progressBar.start() 122 | 123 | val potentialRules = for { 124 | consequent <- consequents 125 | premise <- premises 126 | if premise != consequent 127 | } yield { 128 | val (scoreMF, numPremisesMF) = implScoreMF(premise, consequent, rows) 129 | val (scoreData, _) = implScoreData(premise, consequent, rows) 130 | progressBar.apply(consequent.toString) 131 | (scoreMF, scoreData, numPremisesMF, premise, consequent) 132 | } 133 | 134 | println() 135 | println("Writing formulae...") 136 | val ruleWriter = new FileWriter("wolfe-apps/data/formulae/mt.txt") 137 | potentialRules 138 | //.filter(_._2 >= 0.9) 139 | .filter(_._3 >= 10) 140 | .sortBy(-_._1) 141 | //.sortBy(-_._2) 142 | .take(100000) 143 | .zipWithIndex 144 | .foreach(z => {
145 | val (t, ix) = z 146 | ruleWriter.write("//%d\t%.2f\t%.2f\t%d\n".format(ix + 1, t._1, t._2, t._3)) 147 | ruleWriter.write(s"${t._4} => ${t._5}\n\n") 148 | }) 149 | ruleWriter.close() 150 | } -------------------------------------------------------------------------------- /src/main/scala/uclmr/io/FigerPB.scala: -------------------------------------------------------------------------------- 1 | package uclmr.io 2 | 3 | import com.google.protobuf.CodedOutputStream._ 4 | import com.google.protobuf.ExtensionRegistryLite._ 5 | 6 | /** 7 | * Created by sameer on 11/16/14. 8 | */ 9 | object FigerPB { 10 | // Generated by ScalaBuff, the Scala Protocol Buffers compiler. DO NOT EDIT! 11 | // source: entity.proto 12 | 13 | final case class Mention ( 14 | `start`: Option[Int] = None, 15 | `end`: Option[Int] = None, 16 | `tokens`: collection.immutable.Seq[String] = Vector.empty[String], 17 | `posTags`: collection.immutable.Seq[String] = Vector.empty[String], 18 | `deps`: collection.immutable.Seq[Mention.Dependency] = Vector.empty[Mention.Dependency], 19 | `entityName`: Option[String] = None, 20 | `features`: collection.immutable.Seq[String] = Vector.empty[String], 21 | `labels`: collection.immutable.Seq[String] = Vector.empty[String], 22 | `sentid`: Option[Int] = None, 23 | `fileid`: Option[String] = None 24 | ) extends com.google.protobuf.GeneratedMessageLite 25 | with com.google.protobuf.MessageLite.Builder 26 | with net.sandrogrzicic.scalabuff.Message[Mention] 27 | with net.sandrogrzicic.scalabuff.Parser[Mention] { 28 | 29 | def setStart(_f: Int) = copy(`start` = Some(_f)) 30 | def setEnd(_f: Int) = copy(`end` = Some(_f)) 31 | def setTokens(_i: Int, _v: String) = copy(`tokens` = `tokens`.updated(_i, _v)) 32 | def addTokens(_f: String) = copy(`tokens` = `tokens` :+ _f) 33 | def addAllTokens(_f: String*) = copy(`tokens` = `tokens` ++ _f) 34 | def addAllTokens(_f: TraversableOnce[String]) = copy(`tokens` = `tokens` ++ _f) 35 | def setPosTags(_i: Int, _v: String) = copy(`posTags` = `posTags`.updated(_i, _v)) 36 | def addPosTags(_f: String) = copy(`posTags` = `posTags` :+ _f) 37 | def addAllPosTags(_f: String*) = copy(`posTags` = `posTags` ++ _f) 38 | def addAllPosTags(_f: TraversableOnce[String]) = copy(`posTags` = `posTags` ++ _f) 39 | def setDeps(_i: Int, _v: Mention.Dependency) = copy(`deps` = `deps`.updated(_i, _v)) 40 | def addDeps(_f: Mention.Dependency) = copy(`deps` = `deps` :+ _f) 41 | def addAllDeps(_f: Mention.Dependency*) = copy(`deps` = `deps` ++ _f) 42 | def addAllDeps(_f: TraversableOnce[Mention.Dependency]) = copy(`deps` = `deps` ++ _f) 43 | def setEntityName(_f: String) = copy(`entityName` = Some(_f)) 44 | def setFeatures(_i: Int, _v: String) = copy(`features` = `features`.updated(_i, _v)) 45 | def addFeatures(_f: String) = copy(`features` = `features` :+ _f) 46 | def addAllFeatures(_f: String*) = copy(`features` = `features` ++ _f) 47 | def addAllFeatures(_f: TraversableOnce[String]) = copy(`features` = `features` ++ _f) 48 | def setLabels(_i: Int, _v: String) = copy(`labels` = `labels`.updated(_i, _v)) 49 | def addLabels(_f: String) = copy(`labels` = `labels` :+ _f) 50 | def addAllLabels(_f: String*) = copy(`labels` = `labels` ++ _f) 51 | def addAllLabels(_f: TraversableOnce[String]) = copy(`labels` = `labels` ++ _f) 52 | def setSentid(_f: Int) = copy(`sentid` = Some(_f)) 53 | def setFileid(_f: String) = copy(`fileid` = Some(_f)) 54 | 55 | def clearStart = copy(`start` = None) 56 | def clearEnd = copy(`end` = None) 57 | def clearTokens = copy(`tokens` = Vector.empty[String]) 58 | 
def clearPosTags = copy(`posTags` = Vector.empty[String]) 59 | def clearDeps = copy(`deps` = Vector.empty[Mention.Dependency]) 60 | def clearEntityName = copy(`entityName` = None) 61 | def clearFeatures = copy(`features` = Vector.empty[String]) 62 | def clearLabels = copy(`labels` = Vector.empty[String]) 63 | def clearSentid = copy(`sentid` = None) 64 | def clearFileid = copy(`fileid` = None) 65 | 66 | def writeTo(output: com.google.protobuf.CodedOutputStream) { 67 | if (`start`.isDefined) output.writeInt32(1, `start`.get) 68 | if (`end`.isDefined) output.writeInt32(2, `end`.get) 69 | for (_v <- `tokens`) output.writeString(3, _v) 70 | for (_v <- `posTags`) output.writeString(4, _v) 71 | for (_v <- `deps`) output.writeMessage(5, _v) 72 | if (`entityName`.isDefined) output.writeString(6, `entityName`.get) 73 | for (_v <- `features`) output.writeString(7, _v) 74 | for (_v <- `labels`) output.writeString(8, _v) 75 | if (`sentid`.isDefined) output.writeInt32(9, `sentid`.get) 76 | if (`fileid`.isDefined) output.writeString(10, `fileid`.get) 77 | } 78 | 79 | lazy val getSerializedSize = { 80 | import com.google.protobuf.CodedOutputStream._ 81 | var __size = 0 82 | if (`start`.isDefined) __size += computeInt32Size(1, `start`.get) 83 | if (`end`.isDefined) __size += computeInt32Size(2, `end`.get) 84 | for (_v <- `tokens`) __size += computeStringSize(3, _v) 85 | for (_v <- `posTags`) __size += computeStringSize(4, _v) 86 | for (_v <- `deps`) __size += computeMessageSize(5, _v) 87 | if (`entityName`.isDefined) __size += computeStringSize(6, `entityName`.get) 88 | for (_v <- `features`) __size += computeStringSize(7, _v) 89 | for (_v <- `labels`) __size += computeStringSize(8, _v) 90 | if (`sentid`.isDefined) __size += computeInt32Size(9, `sentid`.get) 91 | if (`fileid`.isDefined) __size += computeStringSize(10, `fileid`.get) 92 | 93 | __size 94 | } 95 | 96 | def mergeFrom(in: com.google.protobuf.CodedInputStream, extensionRegistry: com.google.protobuf.ExtensionRegistryLite): Mention = { 97 | import com.google.protobuf.ExtensionRegistryLite.{getEmptyRegistry => _emptyRegistry} 98 | var __start: Option[Int] = `start` 99 | var __end: Option[Int] = `end` 100 | val __tokens: collection.mutable.Buffer[String] = `tokens`.toBuffer 101 | val __posTags: collection.mutable.Buffer[String] = `posTags`.toBuffer 102 | val __deps: collection.mutable.Buffer[Mention.Dependency] = `deps`.toBuffer 103 | var __entityName: Option[String] = `entityName` 104 | val __features: collection.mutable.Buffer[String] = `features`.toBuffer 105 | val __labels: collection.mutable.Buffer[String] = `labels`.toBuffer 106 | var __sentid: Option[Int] = `sentid` 107 | var __fileid: Option[String] = `fileid` 108 | 109 | def __newMerged = Mention( 110 | __start, 111 | __end, 112 | Vector(__tokens: _*), 113 | Vector(__posTags: _*), 114 | Vector(__deps: _*), 115 | __entityName, 116 | Vector(__features: _*), 117 | Vector(__labels: _*), 118 | __sentid, 119 | __fileid 120 | ) 121 | while (true) in.readTag match { 122 | case 0 => return __newMerged 123 | case 8 => __start = Some(in.readInt32()) 124 | case 16 => __end = Some(in.readInt32()) 125 | case 26 => __tokens += in.readString() 126 | case 34 => __posTags += in.readString() 127 | case 42 => __deps += readMessage[Mention.Dependency](in, Mention.Dependency.defaultInstance, _emptyRegistry) 128 | case 50 => __entityName = Some(in.readString()) 129 | case 58 => __features += in.readString() 130 | case 66 => __labels += in.readString() 131 | case 72 => __sentid = Some(in.readInt32()) 132 | case 82 
=> __fileid = Some(in.readString()) 133 | case default => if (!in.skipField(default)) return __newMerged 134 | } 135 | null 136 | } 137 | 138 | def mergeFrom(m: Mention) = { 139 | Mention( 140 | m.`start`.orElse(`start`), 141 | m.`end`.orElse(`end`), 142 | `tokens` ++ m.`tokens`, 143 | `posTags` ++ m.`posTags`, 144 | `deps` ++ m.`deps`, 145 | m.`entityName`.orElse(`entityName`), 146 | `features` ++ m.`features`, 147 | `labels` ++ m.`labels`, 148 | m.`sentid`.orElse(`sentid`), 149 | m.`fileid`.orElse(`fileid`) 150 | ) 151 | } 152 | 153 | def getDefaultInstanceForType = Mention.defaultInstance 154 | def clear = getDefaultInstanceForType 155 | def isInitialized = true 156 | def build = this 157 | def buildPartial = this 158 | def parsePartialFrom(cis: com.google.protobuf.CodedInputStream, er: com.google.protobuf.ExtensionRegistryLite) = mergeFrom(cis, er) 159 | override def getParserForType = this 160 | def newBuilderForType = getDefaultInstanceForType 161 | def toBuilder = this 162 | def toJson(indent: Int = 0): String = "ScalaBuff JSON generation not enabled. Use --generate_json_method to enable." 163 | } 164 | 165 | object Mention { 166 | @reflect.BeanProperty val defaultInstance = new Mention() 167 | 168 | def parseFrom(data: Array[Byte]): Mention = defaultInstance.mergeFrom(data) 169 | def parseFrom(data: Array[Byte], offset: Int, length: Int): Mention = defaultInstance.mergeFrom(data, offset, length) 170 | def parseFrom(byteString: com.google.protobuf.ByteString): Mention = defaultInstance.mergeFrom(byteString) 171 | def parseFrom(stream: java.io.InputStream): Mention = defaultInstance.mergeFrom(stream) 172 | def parseDelimitedFrom(stream: java.io.InputStream): Option[Mention] = defaultInstance.mergeDelimitedFromStream(stream) 173 | 174 | val START_FIELD_NUMBER = 1 175 | val END_FIELD_NUMBER = 2 176 | val TOKENS_FIELD_NUMBER = 3 177 | val POS_TAGS_FIELD_NUMBER = 4 178 | val DEPS_FIELD_NUMBER = 5 179 | val ENTITY_NAME_FIELD_NUMBER = 6 180 | val FEATURES_FIELD_NUMBER = 7 181 | val LABELS_FIELD_NUMBER = 8 182 | val SENTID_FIELD_NUMBER = 9 183 | val FILEID_FIELD_NUMBER = 10 184 | 185 | def newBuilder = defaultInstance.newBuilderForType 186 | def newBuilder(prototype: Mention) = defaultInstance.mergeFrom(prototype) 187 | 188 | final case class Dependency ( 189 | `type`: Option[String] = None, 190 | `gov`: Option[Int] = None, 191 | `dep`: Option[Int] = None 192 | ) extends com.google.protobuf.GeneratedMessageLite 193 | with com.google.protobuf.MessageLite.Builder 194 | with net.sandrogrzicic.scalabuff.Message[Dependency] 195 | with net.sandrogrzicic.scalabuff.Parser[Dependency] { 196 | 197 | def setType(_f: String) = copy(`type` = Some(_f)) 198 | def setGov(_f: Int) = copy(`gov` = Some(_f)) 199 | def setDep(_f: Int) = copy(`dep` = Some(_f)) 200 | 201 | def clearType = copy(`type` = None) 202 | def clearGov = copy(`gov` = None) 203 | def clearDep = copy(`dep` = None) 204 | 205 | def writeTo(output: com.google.protobuf.CodedOutputStream) { 206 | if (`type`.isDefined) output.writeString(1, `type`.get) 207 | if (`gov`.isDefined) output.writeInt32(2, `gov`.get) 208 | if (`dep`.isDefined) output.writeInt32(3, `dep`.get) 209 | } 210 | 211 | lazy val getSerializedSize = { 212 | import com.google.protobuf.CodedOutputStream._ 213 | var __size = 0 214 | if (`type`.isDefined) __size += computeStringSize(1, `type`.get) 215 | if (`gov`.isDefined) __size += computeInt32Size(2, `gov`.get) 216 | if (`dep`.isDefined) __size += computeInt32Size(3, `dep`.get) 217 | 218 | __size 219 | } 220 | 221 | def 
mergeFrom(in: com.google.protobuf.CodedInputStream, extensionRegistry: com.google.protobuf.ExtensionRegistryLite): Dependency = { 222 | import com.google.protobuf.ExtensionRegistryLite.{getEmptyRegistry => _emptyRegistry} 223 | var __type: Option[String] = `type` 224 | var __gov: Option[Int] = `gov` 225 | var __dep: Option[Int] = `dep` 226 | 227 | def __newMerged = Dependency( 228 | __type, 229 | __gov, 230 | __dep 231 | ) 232 | while (true) in.readTag match { 233 | case 0 => return __newMerged 234 | case 10 => __type = Some(in.readString()) 235 | case 16 => __gov = Some(in.readInt32()) 236 | case 24 => __dep = Some(in.readInt32()) 237 | case default => if (!in.skipField(default)) return __newMerged 238 | } 239 | null 240 | } 241 | 242 | def mergeFrom(m: Dependency) = { 243 | Dependency( 244 | m.`type`.orElse(`type`), 245 | m.`gov`.orElse(`gov`), 246 | m.`dep`.orElse(`dep`) 247 | ) 248 | } 249 | 250 | def getDefaultInstanceForType = Dependency.defaultInstance 251 | def clear = getDefaultInstanceForType 252 | def isInitialized = true 253 | def build = this 254 | def buildPartial = this 255 | def parsePartialFrom(cis: com.google.protobuf.CodedInputStream, er: com.google.protobuf.ExtensionRegistryLite) = mergeFrom(cis, er) 256 | override def getParserForType = this 257 | def newBuilderForType = getDefaultInstanceForType 258 | def toBuilder = this 259 | def toJson(indent: Int = 0): String = "ScalaBuff JSON generation not enabled. Use --generate_json_method to enable." 260 | } 261 | 262 | object Dependency { 263 | @reflect.BeanProperty val defaultInstance = new Dependency() 264 | 265 | def parseFrom(data: Array[Byte]): Dependency = defaultInstance.mergeFrom(data) 266 | def parseFrom(data: Array[Byte], offset: Int, length: Int): Dependency = defaultInstance.mergeFrom(data, offset, length) 267 | def parseFrom(byteString: com.google.protobuf.ByteString): Dependency = defaultInstance.mergeFrom(byteString) 268 | def parseFrom(stream: java.io.InputStream): Dependency = defaultInstance.mergeFrom(stream) 269 | def parseDelimitedFrom(stream: java.io.InputStream): Option[Dependency] = defaultInstance.mergeDelimitedFromStream(stream) 270 | 271 | val TYPE_FIELD_NUMBER = 1 272 | val GOV_FIELD_NUMBER = 2 273 | val DEP_FIELD_NUMBER = 3 274 | 275 | def newBuilder = defaultInstance.newBuilderForType 276 | def newBuilder(prototype: Dependency) = defaultInstance.mergeFrom(prototype) 277 | 278 | } 279 | } 280 | 281 | object EntityProtos { 282 | def registerAllExtensions(registry: com.google.protobuf.ExtensionRegistryLite) { 283 | } 284 | 285 | } 286 | 287 | } 288 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/io/MatrixFilter.scala: -------------------------------------------------------------------------------- 1 | package uclmr.io 2 | 3 | import java.io.FileWriter 4 | 5 | import uclmr.TensorKB 6 | import ml.wolfe.util.Conf 7 | 8 | /** 9 | * Reads in a huge sparse matrix and filters it. 10 | * Can also be used to add more data to the matrix (e.g. freebase facts). 
11 | * 12 | * args0: input matrix 13 | * args1: output matrix 14 | * args2-: additional freebase relations 15 | * @author rockt 16 | */ 17 | object MatrixFilter extends App { 18 | val filePath = args.lift(0).getOrElse("./data/bbc/matrix_multi_all.txt") 19 | println("Loading...") 20 | val kb = LoadTSV(filePath = filePath) 21 | println(kb.toInfoString) 22 | 23 | val matricesToAdd = if (args.size > 2) args.tail.tail else Array( 24 | "./data/bbc/matrix_freebase.txt" 25 | ) 26 | println("Loading additional data...") 27 | matricesToAdd.foreach(fileName => LoadTSV(db = kb, filePath = fileName)) 28 | println(kb.toInfoString) 29 | 30 | println("Filtering...") 31 | val filteredKB = new TensorKB() 32 | 33 | val frequentRows = kb.keys2.filter(key2 => kb.getBy2(key2).size > 10).toSet 34 | val frequentCols = kb.keys1.filter(key1 => kb.getBy1(key1).size > 25).toSet 35 | 36 | kb.cells 37 | .filter(c => frequentCols(c.key1) && frequentRows(c.key2)) 38 | .foreach(cell => filteredKB += cell) 39 | 40 | println(filteredKB.toInfoString) 41 | 42 | val fileWriter = new FileWriter(args.lift(1).getOrElse("./data/bbc/matrix_final.txt")) 43 | filteredKB.cells.foreach(cell => { 44 | val (e1, e2) = cell.key2 45 | fileWriter.write(s"${cell.key1}\t$e1\t$e2\t${cell.cellType}\t${cell.target}\n") 46 | }) 47 | fileWriter.close() 48 | } 49 | 50 | /** 51 | * Shows stats about the matrix, e.g., what freebase relations are in there. 52 | */ 53 | object MatrixInspector extends App { 54 | val kb = LoadTSV(filePath = args.lift(0).getOrElse("./data/bbc/matrix_final.txt")) 55 | println(kb.toInfoString) 56 | 57 | val freebaseRelations = kb.keys1.filter(_.toString.startsWith("REL$")) 58 | freebaseRelations.foreach(println) 59 | } -------------------------------------------------------------------------------- /src/main/scala/uclmr/io/TSV.scala: -------------------------------------------------------------------------------- 1 | package uclmr.io 2 | 3 | import uclmr.{DefaultIx, Cell, CellType, TensorKB} 4 | import ml.wolfe.util.{ProgressBar, Conf} 5 | 6 | import scala.io.Source 7 | import scala.util.Random 8 | 9 | /** 10 | * @author rockt 11 | */ 12 | object LoadTSV extends App { 13 | def apply(k: Int = 100, subsample: Double = 1.0, db: TensorKB = null, filePath: String = Conf.getString("inputFile")): TensorKB = { 14 | val kb = if (db != null) db else new TensorKB(k) 15 | val rand = new Random(0L) 16 | 17 | val lines = Source.fromFile(filePath).getLines() 18 | 19 | val progressBar = new ProgressBar(Source.fromFile(filePath).getLines().size, 100000) 20 | progressBar.start() 21 | 22 | for { 23 | fact <- lines 24 | Array(r, e1, e2, typ, target) = fact.split("\t") 25 | } { 26 | val cellType = typ match { 27 | case "Train" => CellType.Train 28 | case "Test" => CellType.Test 29 | case "Dev" => CellType.Dev 30 | case "Observed" => CellType.Observed 31 | } 32 | 33 | if (rand.nextDouble() < subsample) { 34 | val cell = Cell(r, (e1, e2), DefaultIx, target.toDouble, cellType) 35 | kb += cell 36 | } 37 | 38 | progressBar(r) 39 | } 40 | 41 | kb 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/ArgMaxSigmoid.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import cc.factorie.la.DenseTensor1 4 | import cc.factorie.optimize.{AdaGrad, OnlineTrainer, BatchTrainer} 5 | import ml.wolfe.fg.{L2Regularization, CellLogisticLoss, VectorMsgs} 6 | import ml.wolfe.util.ProgressLogging 7 | import
ml.wolfe.{GradientBasedOptimizer, FactorieVector} 8 | import uclmr.{Cell, TensorDB} 9 | 10 | import scala.util.Random 11 | 12 | /** 13 | * @author rockt 14 | */ 15 | object ArgMaxSigmoid extends App { 16 | /** 17 | * Probably the most expensive way to find the argmax of the sigmoid of a dot product in the history of optimization. 18 | * Anyway, finds the vector whose dot product with a given vector yields a sigmoid closest to the target value. 19 | * @param vec a given vector 20 | * @param target target value 21 | * @return vec* such that σ(vec • vec*) ≈ target 22 | */ 23 | def apply(vec: FactorieVector, target: Double = 1.0, lambda: Double = 0.01): FactorieVector = { 24 | val db = new TensorDB(vec.length) 25 | 26 | db += Cell("vec1", "vec2") 27 | 28 | val fg = db.toFactorGraph 29 | 30 | val vec1Node = db.node1("vec1").get 31 | val vec2Node = db.node2("vec2").get 32 | 33 | fg.buildFactor(Seq(vec2Node, vec1Node))(_ map (_ => new VectorMsgs)) { 34 | e => new CellLogisticLoss(e(0), e(1), target, lambda, 1.0, false) with L2Regularization 35 | } 36 | 37 | fg.build() 38 | 39 | vec1Node.variable.asVector.b = vec 40 | 41 | GradientBasedOptimizer(fg, new OnlineTrainer(_, new AdaGrad(rate = 1.0), 1000, 1) with ProgressLogging) 42 | 43 | vec2Node.variable.asVector.b 44 | } 45 | 46 | val rand = new Random(0L) 47 | 48 | val col = new DenseTensor1((0 until 100).map(i => rand.nextGaussian() * 0.1).toArray) 49 | 50 | println(col) 51 | 52 | val row = ArgMaxSigmoid(col) 53 | 54 | 55 | def sig(x: Double) = 1.0 / (1.0 + math.exp(-x)) 56 | 57 | println("vec1: " + col.mkString("\t")) 58 | println("vec2: " + row.mkString("\t")) 59 | println("σ(vec1 • vec2): " + sig(col dot row)) 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/DataInspector.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import java.io.FileWriter 4 | 5 | import uclmr.TensorDB 6 | import uclmr.io.LoadNAACL 7 | import ml.wolfe.util.Conf 8 | 9 | import scala.util.Random 10 | 11 | /** 12 | * @author rockt 13 | */ 14 | object DataInspector extends App { 15 | Conf.add(args.lift(0).getOrElse("./conf/mf.conf")) 16 | 17 | /* 18 | val db = LoadNAACL() 19 | println(db.toInfoString) 20 | 21 | println(db.trainCells.count(_.key1.toString.startsWith("REL$"))) 22 | 23 | Mentions.load() 24 | val pathToMentionsMap = Mentions.pathToMentions 25 | 26 | val paths = Seq( 27 | "path#nn|<-nn<-unit->prep->of->pobj->|pobj:INV", 28 | "path#appos|->appos->producer->dep->|dep:INV", 29 | "path#nsubj|<-nsubj<-city->prep->in->pobj->|pobj", 30 | "path#pobj|<-pobj<-to<-prep<-move->prep->to->pobj->|pobj:INV" 31 | ) 32 | 33 | paths.foreach(p => { 34 | println(p) 35 | println(pathToMentionsMap(p).mkString("\n")) 36 | println() 37 | }) 38 | */ 39 | 40 | val lengthsWriter = new FileWriter("./data/eval/lengths.txt") 41 | 42 | val formulaePredicates = { 43 | val db2 = LoadNAACL() 44 | db2.formulae.map(f => f.predicates(0) -> f.predicates(1)) 45 | } 46 | 47 | def writeLenghts(pathToDB: String, sample: Boolean, label: String) { 48 | val db = new TensorDB(100) 49 | 50 | db.deserialize(pathToDB) 51 | 52 | val rand = new Random(0L) 53 | 54 | val numSamples = 1000 55 | 56 | val pairs = 57 | if (sample) 58 | for (i <- 0 until numSamples) yield { 59 | val premise = db.keys1(rand.nextInt(db.keys1.size)) 60 | val consequent = db.keys1(rand.nextInt(db.keys1.size)) 61 | (premise, consequent) 62 | } 63 | else 64 | formulaePredicates 65 | 66 | //println(pairs.size)
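// a predicate's "length" below is the two-norm of its embedding vector; for every
// (premise, consequent) pair the output file records the difference
// length(consequent) - length(premise) together with the given label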
67 | 68 | def key1ToLength(key1: Any): Double = db.node1(key1).get.variable.asVector.b.twoNorm 69 | 70 | 71 | pairs.foreach(p => { 72 | val (premise, consequent) = p 73 | lengthsWriter.write((key1ToLength(consequent) - key1ToLength(premise)).toString + "\t" + label + "\n") 74 | }) 75 | } 76 | 77 | writeLenghts("data/out/F/serialized/", true, "mf-sample") 78 | writeLenghts("data/out/F/serialized/", false, "mf-formulae") 79 | writeLenghts("data/out/F-Pre/serialized/", false, "pre-formulae") 80 | writeLenghts("data/out/F-formulae-100/serialized/", false, "joint-formulae") 81 | 82 | lengthsWriter.close() 83 | } 84 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/FormulaeAnnotator.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import java.io.FileWriter 4 | 5 | import ml.wolfe.util.ANSIFormatter._ 6 | import ml.wolfe.util.ProgressBar 7 | 8 | import scala.collection.mutable.ArrayBuffer 9 | import scala.io.Source 10 | import scala.util.Random 11 | 12 | 13 | /** 14 | * @author rockt 15 | */ 16 | 17 | object Mentions { 18 | val pathToMentions = new collection.mutable.HashMap[String, ArrayBuffer[String]] 19 | 20 | def load(filePath: String = "./data/naacl2013/nyt-freebase.test.mentions.txt"): Unit = { 21 | println("Loading sentences for dependency paths...") 22 | 23 | val progressBar = new ProgressBar(Source.fromFile(filePath, "iso-8859-1").getLines().size, 100000) 24 | progressBar.start() 25 | 26 | val lines = Source.fromFile(filePath, "iso-8859-1").getLines() 27 | lines.foreach(line => { 28 | if (!line.isEmpty && !line.startsWith("#Document")) { 29 | val splits = line.split("\t") 30 | val label = splits(0) 31 | val arg1 = splits(1) 32 | val arg2 = splits(2) 33 | val typ = splits(3) 34 | 35 | val path = splits.find(_.startsWith("path#")).get 36 | 37 | var sentence = splits.find(_.startsWith("sen#")).get.drop(4) 38 | .replaceAllLiterally(arg1, arg1.onBlue()) 39 | .replaceAllLiterally(arg2, arg2.onRed()) 40 | 41 | pathToMentions.getOrElseUpdate(path, new ArrayBuffer[String]()) += sentence 42 | } 43 | progressBar() 44 | }) 45 | } 46 | } 47 | 48 | object Action extends Enumeration { 49 | type Answer = Value 50 | val No, Yes, Unsure, Opposite, More, Quit, Undefined = Value 51 | } 52 | 53 | object FormulaeAnnotator extends App { 54 | import uclmr.util.Action._ 55 | 56 | val filePath = args.lift(0).getOrElse("./data/formulae/1000.txt") 57 | val reannotate = args.lift(1).getOrElse("false").toBoolean 58 | val skipUntil = args.lift(2).getOrElse("0").toInt 59 | val rhsFilter: String => Boolean = if (args.size > 3) s => !s.endsWith(args(3)) else s => true 60 | 61 | println(s"Creating backup at $filePath.old...") 62 | val backup = new FileWriter(filePath + ".old") 63 | backup.write(Source.fromFile(filePath, "iso-8859-1").getLines().mkString("\n")) 64 | backup.close() 65 | 66 | val rand = new Random(0l) 67 | val fileWriter = new FileWriter(filePath + ".tmp") 68 | 69 | Mentions.load() 70 | val text = Source.fromFile(filePath, "iso-8859-1").getLines().mkString("\n") 71 | val formulae = text.split("\n\n") 72 | val newFile = new FileWriter(filePath) 73 | var quit = false 74 | 75 | formulae.foreach(formulaDatum => { 76 | val Array(statsTmp, formula) = formulaDatum.split("\n") 77 | 78 | val isCurated = statsTmp.endsWith("curated") 79 | val stats = if (isCurated) statsTmp.dropRight(8) else statsTmp 80 | val ix = stats.drop(2).split("\t")(0).toInt 81 | 82 | 83 | if (quit || (!reannotate && 
isCurated) || ix < skipUntil || !rhsFilter(formula)) {
84 | //ignore already curated formulae or the rest of the formulae in case we quit annotation
85 | fileWriter.write(formulaDatum + "\n\n")
86 | } else {
87 | //otherwise start annotation
88 | //currently only supports implications
89 | 
90 | val Array(lhsTmp, arrow, rhsTmp) = formula.split(" ")
91 | 
92 | val isCommentedOut = lhsTmp.startsWith("//")
93 | val isNegated = rhsTmp.startsWith("!")
94 | 
95 | val lhs = if (isCommentedOut) lhsTmp.drop(2) else lhsTmp
96 | val rhs = if (isNegated) rhsTmp.drop(1) else rhsTmp
97 | 
98 | val sentences = Mentions.pathToMentions.get(lhs)
99 | .map(s => rand.shuffle(s)).getOrElse(List())
100 | 
101 | println()
102 | println(stats)
103 | 
104 | var path = lhs.drop(5)
105 | 
106 | val pathWords = path
107 | .split("\\|")(1)
108 | .split("<-|->")
109 | .zipWithIndex
110 | .filter(_._2 % 2 == 0)
111 | .map(_._1)
112 | .toList
113 | .filterNot(_.isEmpty)
114 | .map(_.trim)
115 | 
116 | for (w <- pathWords)
117 | path = path.replaceAllLiterally(w, w.onYellow())
118 | 
119 | val rel = rhs.split("/").last
120 | 
121 | println(path + " => " + rhs.replaceAllLiterally(rel, rel.onMagenta()))
122 | if (reannotate && isCurated)
123 | println("currently".onCyan() + ": " +
124 | (if (isCommentedOut) "//".onRed() else "") +
125 | "A" + " => " + (if (isNegated) "!".onRed() else "") + "B"
126 | )
127 | 
128 | 
129 | var mentionIx = 0 //offset into the shuffled mention list (distinct from the formula index `ix` above)
130 | var answer: Answer = Undefined
131 | 
132 | while (answer == Undefined) {
133 | var examples = sentences.slice(mentionIx, mentionIx + 10).toList
134 | for (w <- pathWords.filter(_.size > 2)) {
135 | examples = examples.map(s => s.replaceAllLiterally(w, w.yellow()))
136 | }
137 | 
138 | examples.foreach(s => println(s"\t$s"))
139 | mentionIx += 10
140 | 
141 | print("\nAdd this rule [y/n/u], the opposite [o], more mentions [m], or stop annotating [quit]: \r")
142 | 
143 | answer = stringToAnswer(readLine())
144 | while (answer == Undefined) {
145 | print("Please answer with [y/n/o/u/m/quit]! 
\r") 146 | answer = stringToAnswer(readLine()) 147 | } 148 | 149 | answer match { 150 | case Yes => 151 | fileWriter.write(s"$stats\tcurated\n") 152 | fileWriter.write(s"$lhs => $rhs\n\n") 153 | case No => 154 | fileWriter.write(s"$stats\tcurated\n") 155 | fileWriter.write(s"//$lhs => $rhs\n\n") 156 | case Opposite => 157 | fileWriter.write(s"$stats\tcurated\n") 158 | fileWriter.write(s"$lhs => !$rhs\n\n") 159 | case Unsure => 160 | fileWriter.write(s"$stats\n") 161 | fileWriter.write(s"//$lhs => $rhs\n\n") 162 | case More => 163 | answer = Undefined 164 | case Quit => 165 | fileWriter.write(formulaDatum + "\n\n") 166 | quit = true 167 | } 168 | } 169 | } 170 | }) 171 | 172 | fileWriter.close() 173 | 174 | def stringToAnswer(string: String): Answer = string.toLowerCase.trim match { 175 | case "y" | "yes" => Yes 176 | case "n" | "no" => No 177 | case "o" | "opposite" => Opposite 178 | case "u" | "unsure" => Unsure 179 | case "m" | "more" => More 180 | case "quit" | "exit" => Quit 181 | case _ => Undefined 182 | } 183 | newFile.write(Source.fromFile(filePath + ".tmp", "iso-8859-1").getLines().mkString("\n")) 184 | newFile.close() 185 | } 186 | 187 | /* 188 | object CompareRanks extends App { 189 | import RuleFinder.loadDB 190 | 191 | val db1Path = if (args.size > 0) args(0) else "./out/vectorland-F/serialized/" 192 | val db2Path = if (args.size > 1) args(1) else "./out/latest/serialized/" //"./out/vectorland-F-rules-100/serialized/" 193 | 194 | val rulesFile = if (args.size > 2) args(2) else db2Path+"rules.txt" 195 | 196 | val implRhsToLhsMap = Source.fromFile(rulesFile, "iso-8859-1").getLines().toList.map(_.split(" => ")) 197 | .collect { case Array(lhs,rhs) if !lhs.startsWith("//") && !rhs.startsWith("!") => (lhs, rhs) } 198 | .groupBy(_._2).mapValues(l => l.map(_._1)) 199 | 200 | val implNegRhsToLhsMap = Source.fromFile(rulesFile, "iso-8859-1").getLines().toList.map(_.split(" => ")) 201 | .collect { case Array(lhs,rhs) if !lhs.startsWith("//") && rhs.startsWith("!") => (lhs, rhs.drop(1)) } 202 | .groupBy(_._2).mapValues(l => l.map(_._1)) 203 | 204 | //println("A => B") 205 | //println(implRhsToLhsMap.mkString("\n")) 206 | 207 | //println("A => !B") 208 | //println(implNegRhsToLhsMap.mkString("\n")) 209 | 210 | val db1 = loadDB(db1Path) 211 | val db2 = loadDB(db2Path) 212 | 213 | def printChanges(formulaMap: Map[String, List[String]]) = { 214 | val changeInImplRanks = toChangeInRanksAndP(formulaMap) 215 | val changeInImplFormulae = toChangeInFormulaeSimAndScore(formulaMap) 216 | for { 217 | key <- changeInImplRanks.keys 218 | (rankChange, rankImproved, pChange) = changeInImplRanks(key) 219 | (simChange, scoreChange, scoreObsChange) = changeInImplFormulae(key) 220 | } println(key + "\n\trank: %8.2f\tups: %5.2f\tp: %5.2f\tscore: %5.2f\tsim: %5.2f" 221 | .format(rankChange, rankImproved, pChange, scoreChange, simChange) + "\n") 222 | } 223 | 224 | def toChangeInRanksAndP(formulaMap: Map[String, List[String]]): Map[String, (Double, Double, Double)] = { 225 | formulaMap.toList.map(t => { 226 | val (key, values) = t 227 | 228 | val db1ImplFacts = sortByRHS(db1, key, formulaMap) 229 | val db2ImplFacts = sortByRHS(db2, key, formulaMap) 230 | 231 | val compared = compare(db1ImplFacts, db2ImplFacts).take(1000) 232 | 233 | //println(compared.mkString("\n")) 234 | 235 | val rankChange = compared.values.map(_._1).sum / compared.size.toDouble 236 | val pChange = compared.values.map(_._2).sum / compared.size 237 | val rankImproved = compared.values.map(_._1).count(_ > 0) / compared.size.toDouble 238 | 
239 | //println(s"\tavg rank change: ${if (rankChange > 0) "+" + rankChange else rankChange}")
240 | //println(s"\tavg p change: $pChange")
241 | key -> (rankChange, rankImproved, pChange)
242 | }).toMap
243 | }
244 | 
245 | def sortByRHS(db: SPDB, relation: String, rhsToLhsMap: Map[String, List[String]]): Map[Fact, (Int, Double)] =
246 | getTopKFactsPerRel(db, relation).zipWithIndex
247 | .filter { case (f, ix) => existsPremise(db1, f, rhsToLhsMap) }
248 | .map { case (f, ix) => f -> (ix, db.prob(f)) }
249 | .toMap
250 | 
251 | def getTopKFactsPerRel(db: SPDB, relation: String, k: Int = Int.MaxValue): Seq[Fact] =
252 | db.facts(db.relation(relation).get).sortBy(f => -db.prob(f)).take(k)
253 | 
254 | def existsPremise(db: SPDB, fact: Fact, rhsToLhsMap: Map[String, List[String]]): Boolean =
255 | rhsToLhsMap(fact.relation.name).exists(lhs => db.fact(fact.args, db.relation(lhs).get).isDefined)
256 | 
257 | def compare(map1: Map[Fact, (Int, Double)], map2: Map[Fact, (Int, Double)]): Map[Fact, (Int, Double)] =
258 | (map1.keySet intersect map2.keySet).map(k => {
259 | val (ix1, p1) = map1(k)
260 | val (ix2, p2) = map2(k)
261 | k -> (ix1 - ix2, p2 - p1)
262 | }).toMap
263 | 
264 | import RuleFinder.implScore
265 | import RuleFinder.implNegScore
266 | import RuleFinder.implScoreTruePremise
267 | import RuleFinder.implScoreTrain
268 | 
269 | def toChangeInFormulaeSimAndScore(formulaMap: Map[String, List[String]]): Map[String, (Double, Double, Double)] = {
270 | formulaMap.toList.map(t => {
271 | val (key, values) = t
272 | var scoreChange = 0.0
273 | var scoreObsChange = 0.0
274 | var simChange = 0.0
275 | for {
276 | lhsName <- values
277 | rhsName = key
278 | rhsDb1 = db1.relation(rhsName).get
279 | lhsDb1 = db1.relation(lhsName).get
280 | rhsDb2 = db2.relation(rhsName).get
281 | lhsDb2 = db2.relation(lhsName).get
282 | lhsEmbeddingDb1 = lhsDb1.embedding(db1)
283 | rhsEmbeddingDb1 = rhsDb1.embedding(db1)
284 | lhsEmbeddingDb2 = lhsDb2.embedding(db2)
285 | rhsEmbeddingDb2 = rhsDb2.embedding(db2)
286 | db1Sim = lhsEmbeddingDb1 cosineSimilarity rhsEmbeddingDb1
287 | db2Sim = lhsEmbeddingDb2 cosineSimilarity rhsEmbeddingDb2
288 | } {
289 | val scoreDb1 = implScore(lhsDb1, rhsDb1, db1.trainFacts.map(_.args))(db1)._1
290 | val scoreDb2 = implScore(lhsDb2, rhsDb2, db2.trainFacts.map(_.args))(db2)._1
291 | 
292 | val scoreObsDb1 = implScore(lhsDb1, rhsDb1, rowsWithTrueLhs(db1, lhsName))(db1)._1
293 | val scoreObsDb2 = implScore(lhsDb2, rhsDb2, rowsWithTrueLhs(db2, lhsName))(db2)._1
294 | 
295 | simChange += (db2Sim - db1Sim)
296 | scoreChange += (scoreDb2 - scoreDb1)
297 | scoreObsChange += (scoreObsDb2 - scoreObsDb1)
298 | }
299 | key -> (simChange / values.size, scoreChange / values.size, scoreObsChange / values.size)
300 | }).toMap
301 | }
302 | 
303 | def rowsWithTrueLhs(db: SPDB, lhs: String): Seq[List[Entity]] =
304 | db.facts(db.relation(lhs).get).filter(_.train).map(_.args)
305 | 
306 | println("A => B")
307 | printChanges(implRhsToLhsMap)
308 | println("\nA => !B")
309 | printChanges(implNegRhsToLhsMap)
310 | }
311 | */
312 | 
313 | object FormulaeFilter extends App {
314 | val filePath = args.lift(0).getOrElse("data/formulae/10000.txt")
315 | val outputPath = args.lift(1).getOrElse("data/formulae/10000-filtered.txt")
316 | val fileWriter = new FileWriter(outputPath)
317 | 
318 | val rules = Source.fromFile(filePath, "iso-8859-1").getLines().mkString("\n").split("\n\n")
319 | for {
320 | rule <- rules
321 | Array(stats, formula) = rule.split("\n")
322 | tmp = stats.drop(2).split("\t")
323 | Array(rank, 
mfHint, dataHint, numPremises) = tmp 324 | if !formula.startsWith("//") 325 | Array(premise, consequent) = formula.split(" => ") 326 | } { 327 | if (false && dataHint.toDouble > 0.75) 328 | fileWriter.write(rule + "\n\n") 329 | else if (dataHint.toDouble <= 0.01 && mfHint.toDouble >= 0.8) 330 | fileWriter.write(stats + "\n" + premise + " => !" + consequent + "\n\n") 331 | } 332 | 333 | fileWriter.close() 334 | } 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/FormulaeExtractor.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import java.io.FileWriter 4 | 5 | import uclmr.{Formula, Impl, ImplNeg, TensorKB} 6 | import ml.wolfe.util.ProgressBar 7 | 8 | 9 | /** 10 | * @author rockt 11 | */ 12 | object FormulaeExtractor extends App { 13 | type Rule = Formula 14 | type Entity = Any 15 | type Relation = Any 16 | type SPDB = TensorKB 17 | 18 | def formulaScore(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true)(implicit db: SPDB): (Double, Int) = { 19 | val rows = pairs.filter(argFilter).map(_.head) 20 | (rows.map(e => rule(e)).sum / rows.size, rows.size) 21 | } 22 | 23 | /** 24 | * Calculates the weight of the formula based on matrix factorization predictions on observed premises. 25 | */ 26 | def formulaScoreMF(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true)(implicit db: SPDB): (Double, Int) = { 27 | val p1 = rule.predicates(0) 28 | val rows = db.getBy1(p1).map { case (ei, ej) => List(ei) } 29 | 30 | //we only care about the score over true observed premises 31 | val filteredRows = rows.filter(argFilter).map(_.head).filter(e => { 32 | val cell = db.get(p1,e).get 33 | cell.train && cell.target == 1.0 34 | }) 35 | 36 | (filteredRows.map(e => rule(e)).sum / filteredRows.size, filteredRows.size) 37 | } 38 | 39 | /** 40 | * Calculates the weight of the formula based on matrix factorization predictions. 41 | */ 42 | def formulaScoreMFPredicted(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true, threshold: Double = 0.1, onlyUnobserved: Boolean = true)(implicit db: SPDB): (Double, Int) = { 43 | val p1 = rule.predicates(0) 44 | val rows = db.getBy1(p1).map { case (ei, ej) => List(ei) } 45 | 46 | //we only care about the score over true predicted premises 47 | val filteredRows = rows.filter(argFilter).map(_.head).filter(e => db.prob(p1,e) >= threshold && (!onlyUnobserved || !db.get(p1, e).get.train)) 48 | 49 | if (filteredRows.isEmpty) (1.0, 0) else (filteredRows.map(e => rule(e)).sum / filteredRows.size, filteredRows.size) 50 | } 51 | 52 | /** 53 | * Calculates the weight of the formula based on the training data. 
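 * A hedged toy example (cells invented): with training cells where A(e1)=1 and B(e1)=1,
 * and A(e2)=1 but B(e2) is missing, Impl(A, B) is satisfied on e1 and violated on e2,
 * so formulaScoreData would return (0.5, 2): half of the true-premise rows obey the rule.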
54 | */
55 | def formulaScoreData(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true)(implicit db: SPDB): (Double, Int) = {
56 | val p1 = rule.predicates(0)
57 | val rows = db.getBy1(p1).map { case (ei, ej) => List(ei) }
58 | 
59 | //we only care about the score over true observed premises
60 | val filteredRows = rows.filter(argFilter).map(_.head).filter(e => {
61 | val cell = db.get(p1,e).get
62 | cell.train && cell.target == 1.0
63 | })
64 | 
65 | (filteredRows.map(e => rule match {
66 | case Impl(_, p2, _) => if (db.get(p1, e).get.target == 1.0 && db.get(p2, e).map(_.target).getOrElse(0.0) == 0.0) 0.0 else 1.0
67 | case ImplNeg(_, p2, _) => if (db.get(p1, e).get.target == 1.0 && db.get(p2, e).map(_.target).getOrElse(0.0) == 1.0) 0.0 else 1.0
68 | }).sum / filteredRows.size, filteredRows.size)
69 | }
70 | 
71 | def implScore(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
72 | formulaScore(Impl(r1,r2), pairs)
73 | 
74 | def implNegScore(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
75 | formulaScore(ImplNeg(r1,r2), pairs)
76 | 
77 | def implScoreMF(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
78 | formulaScoreMF(Impl(r1,r2), pairs)
79 | 
80 | def implNegScoreMF(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
81 | formulaScoreMF(ImplNeg(r1,r2), pairs)
82 | 
83 | def implScoreMFPredicted(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
84 | formulaScoreMFPredicted(Impl(r1,r2), pairs)
85 | 
86 | def implNegScoreMFPredicted(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
87 | formulaScoreMFPredicted(ImplNeg(r1,r2), pairs)
88 | 
89 | def implScoreData(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
90 | formulaScoreData(Impl(r1,r2), pairs)
91 | 
92 | def implNegScoreData(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
93 | formulaScoreData(ImplNeg(r1,r2), pairs)
94 | 
95 | 
96 | 
97 | lazy val testRelations = Seq(
98 | "person/company",
99 | "location/containedby",
100 | "person/nationality",
101 | "author/works_written",
102 | "parent/child",
103 | "person/place_of_birth",
104 | "person/place_of_death",
105 | "neighborhood/neighborhood_of",
106 | "person/parents",
107 | "company/founders",
108 | "sports_team/league",
109 | "team_owner/teams_owned",
110 | "team/arena_stadium",
111 | "film/directed_by",
112 | "broadcast/area_served",
113 | "structure/architect",
114 | "composer/compositions",
115 | "person/religion",
116 | "film/produced_by"
117 | ).toSet
118 | 
119 | def consequentFilter(r: Relation) = testRelations.exists(s => r.asInstanceOf[String].contains(s))
120 | 
121 | 
122 | implicit val db = new SPDB
123 | 
124 | println("Loading db...")
125 | db.deserialize(args.lift(0).getOrElse("wolfe-apps/data/out/F/serialized/"))
126 | println(db.toInfoString)
127 | 
128 | val premises = db.relations
129 | //val consequents = db.relations.filter(consequentFilter)
130 | val consequents = db.relations
131 | 
132 | val rows = db.trainCells.map(_.key2).distinct.map(List(_))
133 | //.map { case (ei, ej) => List(ei, ej) }
134 | 
135 | println("Generating formulae...")
136 | val progressBar = new ProgressBar(consequents.size * premises.size, 1000)
137 | progressBar.start()
138 | 
139 | val potentialRules = for {
140 | consequent <- consequents
141 | premise <- premises
142 | 
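//skip reflexive pairs; every remaining (premise, consequent) combination is scored below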
if premise != consequent 143 | } yield { 144 | val (scoreMF, numPremisesMF) = implScoreMF(premise, consequent, rows) 145 | val (scoreData, _) = implScoreData(premise, consequent, rows) 146 | progressBar.apply(consequent.toString) 147 | (scoreMF, scoreData, numPremisesMF, premise, consequent) 148 | } 149 | 150 | println() 151 | println("Writing formulae...") 152 | val ruleWriter = new FileWriter("wolfe-apps/data/formulae/latest.txt") 153 | potentialRules 154 | //.filter(_._2 >= 0.9) 155 | .filter(_._3 >= 10) 156 | .sortBy(-_._1) 157 | //.sortBy(-_._2) 158 | .take(100000) 159 | .zipWithIndex 160 | .foreach(z => { 161 | val (t, ix) = z 162 | ruleWriter.write("//%d\t%.2f\t%.2f\t%d\n".format(ix + 1, t._1, t._2, t._3)) 163 | ruleWriter.write(s"${t._4} => ${t._5}\n\n") 164 | }) 165 | ruleWriter.close() 166 | } -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/OptimiseMatrixFactorizationHyperParameters.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import ml.wolfe.util._ 4 | import uclmr.MatrixFactorization 5 | 6 | 7 | /** 8 | * Created by Ingolf on 06/11/2014. 9 | */ 10 | object OptimiseMatrixFactorizationHyperParameters extends App { 11 | val mfp = new MatrixFactorisationProblem() 12 | 13 | val myOptimizer: HyperParameterOptimisationAlgorithm = new NelderMeadSimplex() 14 | myOptimizer.optimise(mfp, Map[String, Double]("mf.lambda" -> 0.01, "mf.alpha" -> 0.1)) 15 | 16 | println("Best wMAP: " + myOptimizer.bestScore) 17 | println("Best parameters:\n" + myOptimizer.bestParameters) 18 | } 19 | 20 | class MatrixFactorisationProblem extends OptimisationProblem { 21 | override val parametersToOptimize: Seq[HyperParameter] = Seq(HyperParameter("mf.lambda"), HyperParameter("mf.alpha")) 22 | val startingValues = Map[String, Double]("mf.lambda" -> 0.01, "mf.alpha" -> 0.1) 23 | 24 | /** 25 | * Evaluate the optimisation problem given the set of hyper parameters. 
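 * Note: the implementation below returns the negated wMAP from MatrixFactorization.run();
 * this presumes the Nelder-Mead search minimises its objective, so maximising wMAP
 * amounts to minimising its negation (an assumption, not verified here).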
26 | * @param hyperparameters The map of hyper parameters 27 | * @return The score of this evaluation, higher is better 28 | */ 29 | override def evaluate(hyperparameters: Map[String, Double]): Double = { 30 | val confPath = "conf/mf-debug.conf" 31 | val newConfPath = "conf/mf-hyper.conf" 32 | 33 | OverrideConfig(hyperparameters, newConfPath, confPath) 34 | 35 | val mf = new MatrixFactorization(newConfPath) 36 | 37 | -mf.run() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/Predictor.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import java.io.FileWriter 4 | 5 | import uclmr.hack.EntityHackNormalization 6 | import uclmr.{DefaultIx, TensorKB} 7 | 8 | /** 9 | * @author rockt 10 | */ 11 | object Predictor extends App { 12 | val pathToMatrix = args.lift(0).getOrElse("./data/out/bbc/serialized/") 13 | val outFile = args.lift(1).getOrElse("./data/out/bbc/predictions.txt") 14 | val relations = if (args.size > 2) args.tail else Array( 15 | "REL$/location/administrative_division/country", 16 | "REL$/base/biblioness/bibs_location/country", 17 | "REL$/location/location/contains", 18 | "REL$/people/person/nationality", 19 | "REL$/base/aareas/schema/administrative_area/administrative_parent", 20 | "REL$/location/country/first_level_divisions", 21 | "REL$/location/country/capital" 22 | ) 23 | 24 | println("Loading db...") 25 | val kb = new TensorKB(100) 26 | kb.deserialize(pathToMatrix) 27 | 28 | println(kb.toInfoString) 29 | 30 | println("Predicting facts...") 31 | val predictions = relations.map(rel => rel -> kb.keys2 32 | .filterNot(t => kb.getFact(rel, t, DefaultIx).exists(_.train)) 33 | .map(t => { 34 | (kb.prob(rel, t), t) 35 | }).sortBy(-_._1) 36 | ).toMap 37 | 38 | println("Reporting predictions...") 39 | 40 | if (true || args.size > 1) { 41 | 42 | val writer = new FileWriter(outFile) 43 | 44 | EntityHackNormalization.init() 45 | 46 | predictions.foreach(t => t._2.take(100).foreach { case (score, es) => 47 | val Array(e1, e2) = es.toString.tail.init.split(",") 48 | val can1 = if (e1.startsWith("/m/")) EntityHackNormalization.getCanonical(e1) else e1 49 | val can2 = if (e2.startsWith("/m/")) EntityHackNormalization.getCanonical(e2) else e2 50 | 51 | writer.write(s"$score\t$e1\t$can1\t$e2\t$can2\t${ t._1 }\n") 52 | }) 53 | writer.close() 54 | } else { 55 | predictions.foreach(t => t._2.take(100).foreach { case (score, es) => 56 | val Array(e1, e2) = es.toString.tail.init.split(",") 57 | println(s"$score\t$e1\t$e2\t${ t._1 }") 58 | }) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/SubsampleExperiments.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import java.io.File 4 | 5 | import ml.wolfe.util.{Conf, OverrideConfig, ProgressBar, RunExperimentSeries} 6 | 7 | import scala.util.Random 8 | 9 | /** 10 | * @author rockt 11 | */ 12 | object SubsampleExperiments extends App { 13 | val threads = args.lift(0).getOrElse("1").toInt 14 | val formulaeFile = args.lift(1).getOrElse("data/formulae/filtered.txt") 15 | val confPath = args.lift(2).getOrElse("conf/mf.conf") 16 | val logFilePath = args.lift(3).getOrElse("data/out/experiments.log") 17 | val runLogFilePath = args.lift(4).getOrElse("data/out/run.log") 18 | val runLogFile = new File(runLogFilePath) 19 | val runLogFileDir = new 
File(runLogFilePath.split("/").init.mkString("/"))
20 | runLogFileDir.mkdirs()
21 | runLogFile.createNewFile()
22 | 
23 | 
24 | Conf.add(OverrideConfig(Map("logFile" -> logFilePath), confPath + ".tmp", confPath))
25 | 
26 | val rand = new Random(0l)
27 | 
28 | val series = Map(
29 | "mf.subsample" -> (0 to 20).map(_ / 40.0).toSeq,
30 | "mf.mode" -> Seq("mf", "low-rank-logic", "pre-inference", "post-inference", "inference-only"),
31 | "evalConf" -> Seq("eval-subsample.conf")
32 | ).map { case (key, values) => key -> rand.shuffle(values) } //shuffle once; mapValues returns a lazy view that would re-shuffle on every access
33 | 
34 | 
35 | import scala.sys.process._
36 | val userDir = System.getProperty("user.dir")
37 | 
38 | //first compile project for all workers so that there will be no clashes
39 | Process(Seq("sbt", "compile"), new File(userDir)).!
40 | 
41 | 
42 | val progressBar = new ProgressBar(series.values.map(_.size).product, 1)
43 | progressBar.start()
44 | 
45 | RunExperimentSeries(series, threads, confPath) { conf =>
46 | (Process(Seq(
47 | "sbt",
48 | "vmargs -Xmx4G",
49 | s"run-main uclmr.MatrixFactorization $conf"), new File(userDir)
50 | ) #>> runLogFile).!!
51 | 
52 | progressBar(conf)
53 | }
54 | 
55 | System.exit(0)
56 | }
57 | 
58 | 
59 | 
60 | 
-------------------------------------------------------------------------------- /src/main/scala/uclmr/util/VectorInspector.scala: --------------------------------------------------------------------------------
1 | package uclmr.util
2 | 
3 | import cc.factorie.la._
4 | import ml.wolfe.FactorieVector
5 | import uclmr.io.LoadNAACL
6 | import uclmr.{TensorKB, Impl, TensorDB}
7 | import ml.wolfe.util.Conf
8 | import ml.wolfe.util.RichCollections._
9 | 
10 | import scala.collection.IterableLike
11 | import scala.collection.generic.CanBuildFrom
12 | 
13 | /**
14 | * @author rockt
15 | */
16 | object VectorInspector extends App {
17 | def calculateLengthsAndAngle(v1: FactorieVector, v2: FactorieVector): (Double, Double, Double) = {
18 | val length1 = math.sqrt(v1 dot v1)
19 | val length2 = math.sqrt(v2 dot v2)
20 | 
21 | val angle = math.acos((v1 dot v2) / (length1 * length2)) * (180 / math.Pi)
22 | 
23 | (length1, length2, angle)
24 | }
25 | 
26 | val pathToDB = args.lift(0).getOrElse("./data/out/F-Joint")
27 | 
28 | Conf.add(args.lift(1).getOrElse("./conf/mf.conf"))
29 | 
30 | implicit val db = new TensorKB(100)
31 | println("Deserializing DB...")
32 | db.deserialize(pathToDB + "/serialized/")
33 | 
34 | val db2 = LoadNAACL()
35 | 
36 | val formulae = db2.formulae
37 | //val formulae = Seq(Impl("path#appos|->appos->capital->prep->of->pobj->|pobj", "REL$/location/location/containedby"))
38 | 
39 | 
40 | def analyzeLengthsAndAngles() = {
41 | println("Analyzing vectors...")
42 | val pathToAnnotatedFormulae = args.lift(1).getOrElse("./data/formulae/1000.txt")
43 | val pathToAllFormulae = args.lift(2).getOrElse("./data/formulae/10000.txt")
44 | 
45 | def printStats(premise: String, consequent: String): (Double, Double, Double) = {
46 | val premiseVector = db.vector1(premise).get
47 | val consequentVector = db.vector1(consequent).get
48 | 
49 | val (premiseLength, consequentLength, angle) = calculateLengthsAndAngle(premiseVector, consequentVector)
50 | 
51 | val correctLength = if (premiseLength < consequentLength) "true" else "false"
52 | 
53 | //println("%4.2f°".format(angle) + s"\t$premiseLength\t$consequentLength\t$correctLength")
54 | println("%4.2f°".format(angle) + s"\t$premiseLength\t$consequentLength")
55 | 
56 | (premiseLength, consequentLength, angle)
57 | }
58 | 
59 | val debug = false
60 | val dropFormulae = 0
61 | val numSamples = 10
62 | 
63 | val pairsOfRelations =
64 
| if (debug) Seq("path#poss|<-poss<-executive->appos->|appos:INV" -> "REL$/business/person/company")
65 | else {
66 | val annotatedFormulae = io.Source.fromFile(pathToAnnotatedFormulae).getLines().toList.drop(dropFormulae * 3)
67 | .filterNot(l => l.startsWith("//") || l.isEmpty)
68 | .map(_.split(" => ")).map(a => (a(0), a(1))).filterNot(_._2.startsWith("!"))
69 | 
70 | val allFormulae = io.Source.fromFile(pathToAllFormulae).getLines().toList
71 | .filterNot(l => l.startsWith("//") || l.isEmpty)
72 | .map(_.split(" => ")).map(a => (a(0), a(1))).filterNot(_._2.startsWith("!"))
73 | 
74 | annotatedFormulae.distinctBy(_._2).take(numSamples) ++ allFormulae.take(10000).distinctBy(_._2).takeRight(numSamples)
75 | }
76 | 
77 | println(pairsOfRelations.take(numSamples).mkString("\n"))
78 | println()
79 | 
80 | println(s"Top $numSamples implications")
81 | val top10Stats = pairsOfRelations.take(numSamples).map(t => printStats(t._1, t._2))
82 | val top10AvgLengthDiff = top10Stats.map(t => t._2 - t._1).sum / numSamples.toDouble
83 | val top10AvgAngle = top10Stats.map(_._3).sum / numSamples.toDouble
84 | println("Average length difference: " + top10AvgLengthDiff)
85 | println("Average angle: " + top10AvgAngle)
86 | println()
87 | println(s"Bottom $numSamples implications")
88 | val least10Stats = pairsOfRelations.takeRight(numSamples).map(t => printStats(t._1, t._2))
89 | val least10AvgLengthDiff = least10Stats.map(t => t._2 - t._1).sum / numSamples.toDouble
90 | val least10AvgAngle = least10Stats.map(_._3).sum / numSamples.toDouble
91 | println("Average length difference: " + least10AvgLengthDiff)
92 | println("Average angle: " + least10AvgAngle)
93 | }
94 | 
95 | val entityPairs = db.trainCells.map(_.key2).distinct.map(List(_))
96 | 
97 | def analyzeAsymmetry() = {
98 | val tmp = formulae.map {
99 | case Impl(p1, p2, _) =>
100 | val p1Vector = db.vector1(p1).get
101 | val p2Vector = db.vector1(p2).get
102 | val (p1Length, p2Length, angle) = VectorInspector.calculateLengthsAndAngle(p1Vector, p2Vector)
103 | val lengthDiff = p2Length - p1Length
104 | //val (mfScore, numPremises) = FormulaeExtractor.formulaScoreMF(Impl(p1, p2), entityPairs)
105 | //val (mfScoreInv, numPremisesInv) = FormulaeExtractor.formulaScoreMF(Impl(p2, p1), entityPairs)
106 | val (mfScore, numPremises) = FormulaeExtractor.formulaScoreMFPredicted(Impl(p1, p2), entityPairs)
107 | val (mfScoreInv, numPremisesInv) = FormulaeExtractor.formulaScoreMFPredicted(Impl(p2, p1), entityPairs)
108 | 
109 | //println(p1 + "\t" + p1Vector.toArray.mkString("\t"))
110 | //println(p2 + "\t" + p2Vector.toArray.mkString("\t"))
111 | 
112 | val maxPremise = ArgMaxSigmoid(p1Vector)
113 | val maxConsequent = ArgMaxSigmoid(p2Vector)
114 | 
115 | /*
116 | println("premise(p-premise):\t" + ArgMaxSigmoid.sig(maxPremise dot p1Vector))
117 | println("consequent(p-premise):\t" + ArgMaxSigmoid.sig(maxPremise dot p2Vector))
118 | println("premise(p-consequent):\t" + ArgMaxSigmoid.sig(maxConsequent dot p1Vector))
119 | println("consequent(p-consequent):\t" + ArgMaxSigmoid.sig(maxConsequent dot p2Vector))
120 | */
121 | 
122 | //println(f"%%4.2f [%%d]\t%%4.2f [%%d]\t%%6.4f\t%%6.4f\t%%6.4f\t%%4.2f°\t$p1\t$p2".format(
123 | println(f"%%4.2f [%%d]\t%%4.2f [%%d]\t%%6.4f\t%%6.4f\t%%6.4f\t%%4.2f°\t${ArgMaxSigmoid.sig(maxPremise dot p2Vector)}\t${ArgMaxSigmoid.sig(maxConsequent dot p1Vector)}\t$p1\t$p2".format(
124 | mfScore, numPremises, mfScoreInv, numPremisesInv, p1Length, p2Length, lengthDiff, angle
125 | ))
126 | 
127 | (mfScore, mfScoreInv, angle, lengthDiff)
128 | }
129 | 
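//Reading the aggregates below (a hedged expectation, not something verified here):
//for a well-modelled implication A => B one would hope mfScore > mfScoreInv, i.e.
//B tends to hold where A is predicted but not the reverse, and lengthDiff > 0, i.e.
//the consequent vector is longer than the premise vector. A compact summary of that
//asymmetry, computed from the tuples gathered above:
val asymmetryShare = tmp.count(t => t._1 > t._2) / tmp.length.toDouble
println("Share with mfScore > mfScoreInv:\t" + asymmetryShare)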
130 | 131 | println("Avg A=>B score:\t" + (tmp.map(_._1).sum / tmp.length.toDouble)) 132 | println("Avg B=>A score:\t" + (tmp.map(_._2).sum / tmp.length.toDouble)) 133 | println("Avg angle:\t" + (tmp.map(_._3).sum / tmp.length.toDouble)) 134 | println("Avg length diff:\t" + (tmp.map(_._4).sum / tmp.length.toDouble)) 135 | } 136 | 137 | //analyzeLengthsAndAngles() 138 | analyzeAsymmetry() 139 | } 140 | 141 | object VectorInspectorSpec extends App { 142 | println(VectorInspector.calculateLengthsAndAngle(new DenseTensor1(Array(3.0, 0.0)), new DenseTensor1(Array(5.0, 5.0)))._3) 143 | println(VectorInspector.calculateLengthsAndAngle(new DenseTensor1(Array(3.0, 4.0)), new DenseTensor1(Array(-8.0, 6.0)))._3) 144 | println(VectorInspector.calculateLengthsAndAngle(new DenseTensor1(Array(5.0, 6.0)), new DenseTensor1(Array(-1.0, 4.0)))._3) 145 | println(VectorInspector.calculateLengthsAndAngle(new DenseTensor1(Array(3.0, 5.0)), new DenseTensor1(Array(-1.0, 6.0)))._3) 146 | } -------------------------------------------------------------------------------- /src/test/scala/uclmr/PotentialsSpec.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import ml.wolfe.fg.{BPRPotential, CellLogisticLoss, L2Regularization, VectorMsgs} 4 | import ml.wolfe.util.PotentialDebugger 5 | import ml.wolfe.{DenseVector, FactorGraph} 6 | 7 | import scala.util.Random 8 | 9 | /** 10 | * @author rockt 11 | */ 12 | object PotentialsSpec extends App { 13 | //building factor graph 14 | val fg = new FactorGraph() 15 | val n1 = fg.addVectorNode(100, "c") 16 | val n2 = fg.addVectorNode(100, "p1") 17 | val n3 = fg.addVectorNode(100, "p2") 18 | val n4 = fg.addVectorNode(100, "c2") 19 | 20 | val lambda = 0.01 21 | 22 | fg.buildFactor(Seq(n1, n2))(_ map (_ => new VectorMsgs)) { 23 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda) with L2Regularization 24 | } 25 | 26 | fg.buildFactor(Seq(n1, n3))(_ map (_ => new VectorMsgs)) { 27 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda) with L2Regularization 28 | } 29 | 30 | fg.buildFactor(Seq(n1, n3))(_ map (_ => new VectorMsgs)) { 31 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, 0.5) with L2Regularization 32 | } 33 | 34 | fg.buildFactor(Seq(n1, n2, n3))(_ map (_ => new VectorMsgs)) { 35 | e => new ImplPotential(e(0), e(1), e(2), 1.0, lambda) with L2Regularization 36 | } 37 | 38 | fg.buildFactor(Seq(n1, n2, n3))(_ map (_ => new VectorMsgs)) { 39 | e => new ImplPotential(e(0), e(1), e(2), 1.0, lambda, 10.0) with L2Regularization 40 | } 41 | 42 | fg.buildFactor(Seq(n1, n3, n2))(_ map (_ => new VectorMsgs)) { 43 | e => new ImplNegPotential(e(0), e(1), e(2), 1.0, lambda) with L2Regularization 44 | } 45 | 46 | fg.buildFactor(Seq(n1, n3, n2))(_ map (_ => new VectorMsgs)) { 47 | e => new ImplNegPotential(e(0), e(1), e(2), 1.0, lambda, 0.5) with L2Regularization 48 | } 49 | 50 | 51 | fg.buildFactor(Seq(n1, n4, n2))(_ map (_ => new VectorMsgs)) { 52 | e => new BPRPotential(e(0), e(1), e(2), 1.0, lambda) with L2Regularization 53 | } 54 | 55 | fg.build() 56 | 57 | //initializing weights and messages 58 | val rand = new Random(0l) 59 | def nextInit() = rand.nextGaussian() * 0.1 60 | Seq(n1, n2, n3, n4).foreach(n => { 61 | val vec = new DenseVector((0 until 100).map(i => nextInit()).toArray) 62 | n.variable.asVector.b = vec 63 | n.variable.asVector.setting = vec 64 | }) 65 | fg.factors.foreach(_.edges.foreach(e => e.msgs.asVector.n2f = e.n.variable.asVector.b)) 66 | 67 | //gradient checking 68 | 
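//A hedged sketch of the numerical check such a debugger typically performs; the
//actual epsilon and comparison tolerance inside PotentialDebugger are assumptions here.
//For each parameter x it compares the analytic gradient against a central difference:
def centralDifference(f: Double => Double, x: Double, eps: Double = 1e-5): Double =
  (f(x + eps) - f(x - eps)) / (2 * eps)
//A potential passes when |analytic - numerical| stays below a small tolerance.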
fg.factors.map(_.potential).foreach(PotentialDebugger.checkGradients(_, debug = true))
69 | }
70 | 
-------------------------------------------------------------------------------- /src/test/scala/uclmr/TensorDBSpec.scala: --------------------------------------------------------------------------------
1 | package uclmr
2 | 
3 | import org.scalatest.{Matchers, WordSpec}
4 | 
5 | /**
6 | * Created by rockt on 19/09/2014.
7 | */
8 | class TensorDBSpec extends WordSpec with Matchers {
9 | "A tensor DB" should {
10 | "add and retrieve cells" in {
11 | val db = new TensorDB()
12 | //vector
13 | db += Cell(Seq("a"))
14 | db.get(Seq("a")) shouldBe Some(Cell(Seq("a")))
15 | 
16 | db += Cell(Seq("a", "b"))
17 | db.get(Seq("a", "b")) shouldBe Some(Cell(Seq("a", "b")))
18 | 
19 | //matrix
20 | db += Cell(1, 2)
21 | db.get(1, 2) shouldBe Some(Cell(1, 2))
22 | 
23 | //tensor
24 | db += Cell(1, 2, 3)
25 | db.get(1,2,3) shouldBe Some(Cell(1,2,3))
26 | 
27 | }
28 | 
29 | "be a matrix if cells are indexed by exactly two indices" in {
30 | val db = new TensorDB()
31 | db.isMatrix shouldBe false
32 | 
33 | db += Cell("r")
34 | db.isMatrix shouldBe false
35 | 
36 | db += Cell("r1", "e1")
37 | db.isMatrix shouldBe true
38 | 
39 | db += Cell("r2", "e1", "e2")
40 | db.isMatrix shouldBe false
41 | }
42 | 
43 | "be usable in a natural way for a knowledge base with binary relations" in {
44 | val matrix = new TensorDB()
45 | matrix.sampleTensor(10,5)
46 | println(matrix.toVerboseString(showTrain = true))
47 | 
48 | val tensor = new TensorDB()
49 | tensor.sampleTensor(10,5,5)
50 | 
51 | println(tensor.toVerboseString(showTrain = true))
52 | }
53 | 
54 | "be serializable and deserializable" in {
55 | val db = new TensorDB(5)
56 | db.sampleTensor(10, 10, 0, 0.1) //samples a matrix
57 | db += Impl("r3", "r4")
58 | db += Impl("r4", "r6")
59 | db += Impl("r6", "r2")
60 | 
61 | val fg = db.toFactorGraph
62 | fg.build()
63 | 
64 | db.serialize("/tmp/serialized/")
65 | 
66 | 
67 | val db2 = new TensorDB(5)
68 | db2.deserialize("/tmp/serialized/")
69 | 
70 | db.cells.size shouldBe db2.cells.size
71 | db.keys1.size shouldBe db2.keys1.size
72 | db.keys2.size shouldBe db2.keys2.size
73 | db.keys3.size shouldBe db2.keys3.size
74 | }
75 | }
76 | }
77 | 
78 | 
--------------------------------------------------------------------------------