├── .gitignore ├── .gitmodules ├── .travis.yml ├── LICENSE ├── README.org ├── conf ├── defaul.conf ├── epl-ent.conf ├── epl-synth.conf ├── epl.conf ├── eval-epl.conf ├── eval-subsample.conf ├── eval.conf ├── mf-bpr.conf ├── mf-oc.conf ├── mf.conf ├── naacl2015-Joint.conf ├── naacl2015-MF.conf ├── naacl2015-Zero-Inf.conf ├── naacl2015-Zero-Joint.conf ├── naacl2015-Zero-MF.conf ├── naacl2015-Zero-Post.conf └── naacl2015-Zero-Pre.conf ├── data ├── eval │ ├── Set1.plt │ ├── eval.gpl │ ├── hist.gpl │ └── results.gpl ├── formulae │ └── filtered.txt └── naacl2013 │ ├── naacl2013.gold.tsv │ ├── nyt-freebase.test.subsample-10000-LABELED.tuples.txt │ ├── nyt-freebase.test.subsample-10000.tuples.txt │ └── structured │ ├── eval-naacl13-structured.out.txt │ ├── test-mintz09.txt │ ├── test-riedel13-model-F.txt │ ├── test-riedel13-model-N.txt │ ├── test-riedel13-model-NF.txt │ ├── test-riedel13-model-NFE.txt │ ├── test-rockt-F.txt │ ├── test-surdeanu12.txt │ └── test-yao11.txt ├── overview.png ├── project ├── Build.scala └── plugins.sbt └── src ├── main └── scala │ └── uclmr │ ├── AnnotationTool.scala │ ├── EmbeddedProbLogicEvaluation.scala │ ├── EntityAwareEvaluation.scala │ ├── EntityAwarePredictor.scala │ ├── FactorizationUtil.scala │ ├── GeometricMF.scala │ ├── LogicalInference.scala │ ├── MatrixFactorization.scala │ ├── OCSVM.scala │ ├── PimpMyFactorie.scala │ ├── ProbLogicEmbeddings.scala │ ├── SoftLogicPotentials.scala │ ├── TensorDB.scala │ ├── future │ ├── KB.scala │ └── MatrixFactorization2.scala │ ├── hack │ ├── CoNLLHackReader.scala │ └── MTShowcase.scala │ ├── io │ ├── FIGER.scala │ ├── FigerPB.scala │ ├── MatrixFilter.scala │ ├── NAACL.scala │ └── TSV.scala │ └── util │ ├── ArgMaxSigmoid.scala │ ├── DataInspector.scala │ ├── FormulaeAnnotator.scala │ ├── FormulaeExtractor.scala │ ├── OptimiseMatrixFactorizationHyperParameters.scala │ ├── Predictor.scala │ ├── SubsampleExperiments.scala │ └── VectorInspector.scala └── test └── scala └── uclmr ├── PotentialsSpec.scala └── TensorDBSpec.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache/ 6 | .history/ 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # intellij 16 | .idea 17 | .idea/* 18 | .idea_modules 19 | !.idea/codeStyleSettings.xml 20 | *.iml -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "wolfe"] 2 | path = wolfe 3 | url = git@github.com:wolfe-pack/wolfe.git 4 | branch = dev 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.10.4 4 | jdk: 5 | - oraclejdk8 6 | 7 | #script: "sbt clean scoverage:test" 8 | script: "sbt test" 9 | 10 | #after_script: 11 | # 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013-2016 University College London 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | ** Low-rank Logic Embeddings 2 | 3 | [[https://travis-ci.org/uclmr/low-rank-logic][https://travis-ci.org/uclmr/low-rank-logic.svg?branch=master]] 4 | [[https://gitter.im/uclmr/low-rank-logic?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge][file:https://badges.gitter.im/Join%20Chat.svg]] 5 | [[https://waffle.io/uclmr/low-rank-logic][https://badge.waffle.io/uclmr/low-rank-logic.png?label=ready&title=Ready]] 6 | 7 | This repository contains code accompanying the paper: 8 | [[http://rockt.github.io/][Tim Rocktäschel]], [[http://sameersingh.org][Sameer Singh]] and [[http://www.riedelcastro.org/][Sebastian Riedel]]. _Injecting Logical Background Knowledge into Embeddings for Relation Extraction_. /in: Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics – Human Language Technologies (NAACL HLT)/. 2015. [[[http://rockt.github.io/bib/rocktaschel2015injecting.bib][bib]]] [[[http://rockt.github.io/pdf/rocktaschel2015injecting.pdf][pdf]]] 9 | 10 | 11 | [[./overview.png]] 12 | 13 | 14 | *** Prerequisites 15 | - [[http://git-scm.com/][git]] to clone the repository 16 | - [[http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html][Java JDK]] and [[http://www.scala-sbt.org/][sbt]] to run code 17 | - [[http://www.gnuplot.info/][gnuplot]] to reproduce evaluation plots 18 | - pdflatex to reproduce PDF tables 19 | 20 | *** Download code 21 | **** Clone the repository 22 | #+BEGIN_SRC sh :session mf :results silent 23 | cd ~/workspace 24 | git clone https://github.com/uclmr/low-rank-logic.git && cd low-rank-logic 25 | git submodule update --init --recursive 26 | cd wolfe 27 | git checkout tags/v0.4.0 28 | cd .. 29 | #+END_SRC 30 | 31 | **** COMMENT Move to the project directory 32 | #+BEGIN_SRC sh :session mf :results silent 33 | cd ~/workspace/low-rank-logic 34 | #+END_SRC 35 | 36 | **** Compile and test 37 | #+BEGIN_SRC sh :session mf :results silent 38 | sbt clean compile test 39 | #+END_SRC 40 | 41 | 42 | *** Download data 43 | **** NAACL13 44 | If you have a license for the NYT corpus, please send us an email to obtain the link to the =naacl2013.txt.zip= file.
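The tuple files follow a naming convention that the loaders rely on: surface patterns start with =path=, Freebase relations start with =REL$=, and =REL$NA= is the NA placeholder that gets filtered out. Below is a minimal sketch of that convention, mirroring the =relationFilter= in =EmbeddedProbLogicEvaluation.scala= (the two helper names are ours, for illustration only):

#+BEGIN_SRC scala
// Sketch of the relation-name convention only; the helper names are illustrative.
def isSurfacePattern(rel: String): Boolean = rel.startsWith("path")
def isFreebaseRelation(rel: String): Boolean = rel.startsWith("REL$") && rel != "REL$NA"
// Mirrors relationFilter in EmbeddedProbLogicEvaluation.scala:
def relationFilter(rel: String): Boolean = isSurfacePattern(rel) || isFreebaseRelation(rel)
#+END_SRC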
45 | 46 | ***** Move to resources 47 | #+BEGIN_SRC sh :session mf :results silent 48 | mv ~/Downloads/naacl2013.txt.zip ./src/main/resources 49 | #+END_SRC 50 | 51 | *** Comparison on Complete Data (Figure 3) 52 | **** Matrix factorization 53 | #+BEGIN_SRC sh :session mf :results silent 54 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-MF.conf' 55 | #+END_SRC 56 | 57 | **** Joint Optimization (Section 3.2) 58 | #+BEGIN_SRC sh :session mf :results silent 59 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-Joint.conf' 60 | #+END_SRC 61 | 62 | *** Zero-shot Relation Learning (Table 1) 63 | - MF (matrix factorization) 64 | #+BEGIN_SRC sh :session mf :results silent 65 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-Zero-MF.conf' 66 | #+END_SRC 67 | - Inf (logical inference) 68 | #+BEGIN_SRC sh :session mf :results silent 69 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-Zero-Inf.conf' 70 | #+END_SRC 71 | - Post (post-factorization inference) 72 | #+BEGIN_SRC sh :session mf :results silent 73 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-Zero-Post.conf' 74 | #+END_SRC 75 | - Pre (pre-factorization inference) 76 | #+BEGIN_SRC sh :session mf :results silent 77 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-Zero-Pre.conf' 78 | #+END_SRC 79 | - Joint (joint optimization) 80 | #+BEGIN_SRC sh :session mf :results silent 81 | sbt 'vmargs -Xmx8G' 'run-main uclmr.MatrixFactorization ./conf/naacl2015-Zero-Joint.conf' 82 | #+END_SRC 83 | 84 | **** Generating Figure 85 | #+BEGIN_SRC sh :session mf :results silent 86 | cd data/eval 87 | tail -105 ../out/experiments.log > subsample.log 88 | gnuplot -e 'fileName = "subsample"' eval.gpl 89 | open subsample-wMAP.pdf 90 | #+END_SRC 91 | 92 | *** Relations with Few Distant Labels (Figure 2) 93 | #+BEGIN_SRC sh :session mf :results silent 94 | sbt 'run-main uclmr.util.SubsampleExperiments 4' 95 | #+END_SRC 96 | =4= is the number of threads used to run experiments in parallel. 
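=SubsampleExperiments= itself lives in =src/main/scala/uclmr/util/= and is not reproduced here; the sketch below only illustrates the general pattern of dispatching runs onto a fixed-size thread pool. The fraction grid and the =runExperiment= body are placeholders, not the actual implementation.

#+BEGIN_SRC scala
import java.util.concurrent.Executors
import scala.concurrent.duration.Duration
import scala.concurrent.{Await, ExecutionContext, Future}

// Hypothetical sketch of parallel experiment dispatch; not SubsampleExperiments itself.
object ParallelRunsSketch {
  def main(args: Array[String]): Unit = {
    val threads = args.headOption.map(_.toInt).getOrElse(4)
    val pool = Executors.newFixedThreadPool(threads)
    implicit val ec = ExecutionContext.fromExecutorService(pool)
    val fractions = Seq(0.0, 0.1, 0.2, 0.3, 0.4, 0.5)           // placeholder grid
    def runExperiment(f: Double): String = s"subsample=$f done" // placeholder for one run
    val runs = fractions.map(f => Future(runExperiment(f)))
    Await.result(Future.sequence(runs), Duration.Inf).foreach(println)
    pool.shutdown()
  }
}
#+END_SRC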
97 | 98 | *** Evaluate predictions 99 | #+BEGIN_SRC sh :session mf :results silent 100 | sbt 'run-main uclmr.io.EvaluateNAACL ./conf/eval.conf ./data/out/latest/predict.txt' 101 | #+END_SRC 102 | 103 | **** COMMENT Open PR curve 104 | #+BEGIN_SRC sh :session mf :results silent 105 | open ./data/out/latest/11pointPrecRecall.pdf 106 | #+END_SRC 107 | 108 | **** COMMENT Open results table 109 | #+BEGIN_SRC sh :session mf :results silent 110 | open ./data/out/latest/table.pdf 111 | #+END_SRC 112 | 113 | *** Citing 114 | #+BEGIN_SRC latex 115 | @inproceedings{rocktaschel2015injecting, 116 | title={{Injecting Logical Background Knowledge into Embeddings for Relation Extraction}}, 117 | author={Rockt{\"a}schel, Tim and Singh, Sameer and Riedel, Sebastian}, 118 | booktitle = {Conference of the North American Chapter of the Association for Computational Linguistics – Human Language Technologies (NAACL HLT)}, 119 | year={2015} 120 | } 121 | #+END_SRC 122 | -------------------------------------------------------------------------------- /conf/defaul.conf: -------------------------------------------------------------------------------- 1 | evalConf: "eval.conf" 2 | -------------------------------------------------------------------------------- /conf/epl-ent.conf: -------------------------------------------------------------------------------- 1 | epl { 2 | train: "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.train.tuples.txt" 3 | unlabeled: "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.test.tuples.txt" 4 | test: "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.test.subsample-10000.tuples.txt" 5 | 6 | print-comparisons: false 7 | 8 | use-unlabeled: true 9 | use-entitites: false 10 | 11 | combine-datasets: false 12 | min-rows: 20 13 | min-cols: 2 14 | min-cooccur: 1.0 15 | weigh-terms: false 16 | unit-ball:false 17 | l2-dist: true 18 | prior-repulsion: 0 19 | 20 | relation-dim: 20 21 | subsample: 0.5 22 | opt-iterations: 20 23 | norm-b: true 24 | trainer: "online" 25 | ada-rate: 1.0 26 | 27 | scale-prior: 1.0 28 | bias-prior: 0.0 29 | mult-prior: 1.0 30 | 31 | reg-embed: 0.1 32 | reg-scale: 0.1 33 | reg-bias: 0.1 34 | reg-mult: Infinity 35 | 36 | } -------------------------------------------------------------------------------- /conf/epl-synth.conf: -------------------------------------------------------------------------------- 1 | epl { 2 | relation-dim: 1 3 | subsample: 1.0 4 | opt-iterations: 1000 5 | norm-b: true 6 | ada-rate: 1.0 7 | trainer: "online" 8 | unit-ball: false 9 | l2-dist: true 10 | 11 | scale-prior: 1.0 12 | bias-prior: 0.0 13 | mult-prior: 1.0 14 | 15 | reg-embed: 0.01 16 | reg-scale: Infinity 17 | reg-bias: 0.01 18 | reg-mult: Infinity 19 | } -------------------------------------------------------------------------------- /conf/epl.conf: -------------------------------------------------------------------------------- 1 | epl { 2 | train: "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.train.tuples.txt" 3 | unlabeled: "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.test.tuples.txt" 4 | test: "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.test.subsample-10000.tuples.txt" 5 | 6 | print-comparisons: false 7 | 8 | use-unlabeled: true 9 | use-entitites: true 10 | 11 | combine-datasets: false 12 | min-rows: 5 13 | min-cols: 2 14 | min-cooccur: 1.0 15 | weigh-terms: false 16 | unit-ball:false 17 | l2-dist: true 18 | prior-repulsion: 0 19 | 20 | relation-dim: 20 21 | subsample: 0.5 22 | opt-iterations: 100 23 | norm-b: true 24 | trainer: "online" 25 | 
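// ada-rate below is, by our reading of the key name, the step size of the AdaGrad-based "online" trainer (undocumented in the repo)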
ada-rate: 1.0 26 | 27 | scale-prior: 1.0 28 | bias-prior: 0.0 29 | mult-prior: 1.0 30 | 31 | reg-embed: 0.1 32 | reg-scale: 0.1 33 | reg-bias: 0.1 34 | reg-mult: Infinity 35 | 36 | } -------------------------------------------------------------------------------- /conf/eval-epl.conf: -------------------------------------------------------------------------------- 1 | eval { 2 | extra-relations: [] 3 | targets: ["person/company$","person/nationality$","team_owner/teams_owned$","company/founders$", 4 | "location/containedby$","neighborhood/neighborhood_of", 5 | "parent/child$","person/parents$","person/place_of_birth$","person/place_of_death$", 6 | "author/works_written$","team/arena_stadium$", 7 | "film/directed_by","roadcast/area_served$","structure/architect$", 8 | "composer/compositions$","sports_team/league$","person/religion$","film/produced_by$" 9 | ] 10 | //gold: "data/2014.gold.tsv" 11 | //gold: "data/annotations/latest.tsv" 12 | gold: "data/sriedel-annotation/latest.tsv" 13 | pool-depth: 100 14 | run-depth: 1000 15 | subsample: "data/naacl2013/nyt-freebase.test.subsample-10000.tuples.txt" 16 | } -------------------------------------------------------------------------------- /conf/eval-subsample.conf: -------------------------------------------------------------------------------- 1 | eval { 2 | extra-relations: [] 3 | targets: ["person/company$","person/nationality$","company/founders$", 4 | "location/containedby$","neighborhood/neighborhood_of", 5 | "parent/child$","person/parents$","person/place_of_birth$","person/place_of_death$", 6 | "author/works_written$", 7 | "film/directed_by", "film/produced_by$" 8 | ] 9 | //gold: "data/2014.gold.tsv" 10 | //gold: "data/annotations/latest.tsv" 11 | gold: "data/naacl2013/naacl2013.gold.tsv" 12 | pool-depth: 100 13 | run-depth: 1000 14 | subsample: "data/naacl2013/nyt-freebase.test.subsample-10000.tuples.txt" 15 | } -------------------------------------------------------------------------------- /conf/eval.conf: -------------------------------------------------------------------------------- 1 | eval { 2 | extra-relations: [] 3 | targets: ["person/company$","person/nationality$","team_owner/teams_owned$","company/founders$", 4 | "location/containedby$","neighborhood/neighborhood_of", 5 | "parent/child$","person/parents$","person/place_of_birth$","person/place_of_death$", 6 | "author/works_written$","team/arena_stadium$", 7 | "film/directed_by","roadcast/area_served$","structure/architect$", 8 | "composer/compositions$","sports_team/league$","person/religion$","film/produced_by$" 9 | ] 10 | //gold: "data/2014.gold.tsv" 11 | //gold: "data/annotations/latest.tsv" 12 | gold: "data/naacl2013/naacl2013.gold.tsv" 13 | pool-depth: 100 14 | run-depth: 1000 15 | subsample: "data/naacl2013/nyt-freebase.test.subsample-10000.tuples.txt" 16 | } -------------------------------------------------------------------------------- /conf/mf-bpr.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: true 8 | 9 | subsample: 1.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | 
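// by our reading, minDataHint/minMFHint/minNumPremises threshold which formulae are used; zero values keep them all (undocumented)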
minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "mf" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval.conf" 61 | -------------------------------------------------------------------------------- /conf/mf-oc.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.1 4 | alpha: 0.1 5 | maxIter: 1 6 | 7 | subsample: 1.0 8 | negPerPos: 1 9 | unobservedPerF: 1 10 | 11 | cellWeight: 1.0 12 | formulaeWeight: 1.0 13 | 14 | gamma: 0 //0.01 //0.01 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | use-features: false 24 | 25 | debug: true 26 | 27 | outFile: "predict.txt" 28 | } 29 | 30 | //formulaeFile: "data/formulae/curated-50-100.txt" 31 | //formulaeFile: "data/formulae/curated.txt" 32 | outDir: "data/out" 33 | logFile: "data/out/experiments.log" 34 | -------------------------------------------------------------------------------- /conf/mf.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 1.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "mf" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-Joint.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 1.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | 
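// note: this file is identical to mf.conf except for mode: "low-rank-logic" at the bottom, i.e. the joint optimization of Section 3.2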
inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "low-rank-logic" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-MF.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 1.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "mf" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-Zero-Inf.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 0.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "inference-only" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval-subsample.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-Zero-Joint.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 
| bpr: false 8 | 9 | subsample: 0.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "low-rank-logic" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval-subsample.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-Zero-MF.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 0.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "mf" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval-subsample.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-Zero-Post.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 0.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "post-inference" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 
51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval-subsample.conf" 61 | -------------------------------------------------------------------------------- /conf/naacl2015-Zero-Pre.conf: -------------------------------------------------------------------------------- 1 | mf { 2 | k: 100 3 | lambda: 0.01 4 | alpha: 0.1 5 | maxIter: 200 6 | 7 | bpr: false 8 | 9 | subsample: 0.0 10 | negPerPos: 1 11 | unobservedPerF: 1 12 | 13 | cellWeight: 1.0 14 | formulaeWeight: 1.0 15 | 16 | batchTraining: false 17 | //optimizer: "SGD" 18 | optimizer: "AdaGrad" 19 | //optimizer: "AdaMira" 20 | //optimizer: "LBFGS" 21 | //optimizer: "AvgPerceptron" 22 | 23 | outFile: "predict.txt" 24 | 25 | use-features: false 26 | 27 | //formulaeStart: 0 28 | //formulaeEnd: 10 29 | minDataHint: 0.0 30 | minMFHint: 0.0 31 | minNumPremises: 0 32 | onlyAnnotated: true 33 | inject-rows: true 34 | test-row-terms: true 35 | force-symmetry: false 36 | 37 | mode: "pre-inference" 38 | } 39 | 40 | dataType: "naacl" 41 | 42 | naacl { 43 | formulaeFile: "data/formulae/filtered.txt" 44 | } 45 | 46 | figer { 47 | dataDir: "data/figer" 48 | formulaeFile: "None" 49 | prob-dev: 0.0 50 | prob-neg-labels: 0.1 51 | sample-train-entities: 0.01 52 | sample-train-facts: 0.01 53 | use-features: true 54 | } 55 | 56 | outDir: "data/out" 57 | logFile: "data/out/experiments.log" 58 | serialize: true 59 | inputFile: "naacl2013.txt.zip" 60 | evalConf: "eval-subsample.conf" 61 | -------------------------------------------------------------------------------- /data/eval/Set1.plt: -------------------------------------------------------------------------------- 1 | # line styles for ColorBrewer Set1 2 | # for use with qualitative/categorical data 3 | # provides 8 easy-to-name colors 4 | # compatible with gnuplot >=4.2 5 | # author: Anna Schneider 6 | 7 | # line styles 8 | set style line 1 lc rgb '#E41A1C' # red 9 | set style line 2 lc rgb '#377EB8' # blue 10 | set style line 3 lc rgb '#4DAF4A' # green 11 | set style line 4 lc rgb '#984EA3' # purple 12 | set style line 5 lc rgb '#FF7F00' # orange 13 | set style line 6 lc rgb '#FFFF33' # yellow 14 | set style line 7 lc rgb '#A65628' # brown 15 | set style line 8 lc rgb '#F781BF' # pink 16 | 17 | # palette 18 | set palette maxcolors 8 19 | set palette defined ( 0 '#E41A1C',\ 20 | 1 '#377EB8',\ 21 | 2 '#4DAF4A',\ 22 | 3 '#984EA3',\ 23 | 4 '#FF7F00',\ 24 | 5 '#FFFF33',\ 25 | 6 '#A65628',\ 26 | 7 '#F781BF' ) -------------------------------------------------------------------------------- /data/eval/eval.gpl: -------------------------------------------------------------------------------- 1 | # Chart settings 2 | #set title "Injecting Logic into Synthetic Matrices" 3 | set terminal dumb enhanced 4 | #set term x11 5 | #set termoption enhanced 6 | set key right center 7 | #set key width -1.5 8 | set key spacing 1.1 9 | set key box linewidth 3 10 | set border linewidth 3 11 | 12 | #load 'Spectral.plt' 13 | load 'Set1.plt' 14 | 15 | if (!exists("fileName")) fileName='experiments' 16 | 17 | set style line 3 lc rgb '#4DAF4A' lt 6 lw 1 18 | set style line 7 lc rgb '#984EA3' lw 1 19 | set style line 12 lc rgb '#FFC020' lt 4 lw 1 20 | set style line 13 lc rgb '#bbbbbb' lt 1 lw 2 21 | set style line 14 lc rgb '#dddddd' lt 4 lw 1 22 | set grid xtics mxtics ytics mytics back ls 13, ls 14 23 | 24 | 25 | set yrange [0.0:0.625] 26 | set 
xrange [0.0:0.5] 27 | set ytics 0.0,0.2,1 28 | set xtics 0.0,0.1,1 29 | 30 | set mytics 4 31 | set mxtics 4 32 | 33 | set xlabel "Fraction of Freebase training facts" 34 | set ylabel "MAP" 35 | 36 | 37 | mf = "< grep \"mf\" ".fileName.".log" 38 | low_rank_logic = "< grep \"low-rank-logic\" ".fileName.".log" 39 | inference_only = "< grep \"inference-only\" ".fileName.".log" 40 | pre_inference = "< grep \"pre-inference\" ".fileName.".log" 41 | post_inference = "< grep \"post-inference\" ".fileName.".log" 42 | pre_post_inference = "< grep \"pre-post-inference\" ".fileName.".log" 43 | 44 | #plot mf using 3:1 smooth unique with linespoints ls 1 linewidth 3 title "Matrix Factorization",\ 45 | low_rank_logic using 3:1 smooth unique with linespoints ls 2 linewidth 3 title "Joint Optimization",\ 46 | pre_inference using 3:1 smooth unique with linespoints ls 4 linewidth 3 title "Pre-Inference",\ 47 | post_inference using 3:1 smooth unique with linespoints ls 5 linewidth 3 title "Post-Inference",\ 48 | inference_only using 3:1 smooth unique with linespoints ls 3 linewidth 3 title "Logical Inference",\ 49 | 50 | # Wrapup 51 | #set terminal pdf enhanced dashed size 2.75,2 #1.75 #size 400,400 52 | #set terminal png truecolor size 500,500 53 | #set output fileName."-MAP.png" 54 | #set output fileName."-MAP.pdf" 55 | #refresh 56 | #unset output 57 | 58 | #set terminal dumb enhanced 59 | 60 | #set object 1 rect from 0,0.3 to 0.4,0.6 lw 6 fs empty border lc rgb 'gold' 61 | 62 | plot mf using 3:2 smooth unique with linespoints ls 1 linewidth 3 title "MF",\ 63 | low_rank_logic using 3:2 smooth unique with linespoints ls 2 linewidth 3 title "Joint",\ 64 | pre_inference using 3:2 smooth unique with linespoints ls 12 linewidth 3 title "Pre",\ 65 | post_inference using 3:2 smooth unique with linespoints ls 3 linewidth 3 title "Post",\ 66 | inference_only using 3:2 smooth unique with linespoints ls 7 linewidth 3 title "Inf" 67 | 68 | 69 | # Wrapup 70 | set ylabel "wMAP" 71 | set terminal pdf enhanced dashed size 2.75,1.85#,1.85 #1.75 #size 400,400 72 | set output fileName."-wMAP.pdf" 73 | #set terminal png truecolor size 500,500 74 | #set output fileName."-wMAP.png" 75 | refresh 76 | unset output -------------------------------------------------------------------------------- /data/eval/hist.gpl: -------------------------------------------------------------------------------- 1 | set terminal dumb enhanced 2 | #unset key 3 | 4 | binwidth=0.01 5 | bin(x,width)=width*floor(x/width) 6 | 7 | set xlabel "Distribution over length differences of implications" 8 | 9 | 10 | if (!exists("fileName")) fileName='lengths' 11 | 12 | mf_formulae = "< grep \"mf-formulae\" ".fileName.".txt" 13 | mf_sample = "< grep \"mf-sample\" ".fileName.".txt" 14 | joint_formulae = "< grep \"joint-formulae\" ".fileName.".txt" 15 | pre_formulae = "< grep \"pre-formulae\" ".fileName.".txt" 16 | 17 | plot mf_sample using (bin($1,binwidth)):(1.0) smooth freq with boxes title "mf-sample",\ 18 | mf_formulae using (bin($1,binwidth)):(1.0) smooth freq with boxes title "mf-formulae",\ 19 | pre_formulae using (bin($1,binwidth)):(1.0) smooth freq with boxes title "pre-formulae",\ 20 | joint_formulae using (bin($1,binwidth)):(1.0) smooth freq with boxes title "joint-formulae",\ 21 | 22 | 23 | # Wrapup 24 | set terminal pdf enhanced size 2.75,2 #1.75 #size 400,400 25 | #set terminal png truecolor size 500,500 26 | #set output fileName.".png" 27 | set output fileName.".pdf" 28 | refresh 29 | unset output 
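For reference, gnuplot's bin(x,width) above snaps each value to the left edge of its bin; a Scala equivalent of the same formula (illustrative only):

    // Scala equivalent of gnuplot's bin(x,width) = width*floor(x/width)
    def bin(x: Double, width: Double): Double = width * math.floor(x / width)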
-------------------------------------------------------------------------------- /data/eval/results.gpl: -------------------------------------------------------------------------------- 1 | # Chart settings 2 | #set title "Averaged 11-point Precision/Recall" 3 | set key right top 4 | set key width 0.5 5 | set key box 6 | 7 | load 'Set1.plt' 8 | #load 'Spectral.plt' 9 | 10 | 11 | set terminal pdf enhanced linewidth 3.0 size 3.000000,2.00000 12 | set output "results.pdf" 13 | # XYChart settings 14 | set nologscale 15 | 16 | 17 | #set style line 12 lc rgb '#bbbbbb' lt 1 lw 1 18 | #set style line 13 lc rgb '#cccccc' lt 4 lw 1 19 | #set grid xtics mxtics ytics mytics back ls 12, ls 13 20 | 21 | 22 | set yrange [0.0:1] 23 | set xrange [0.0:1] 24 | set ytics 0.2,0.2,1 25 | set xtics 0.0,0.1,1 26 | 27 | #set mytics 2 28 | #set mxtics 2 29 | 30 | 31 | set xr [0.0:1.0] noreverse 32 | set yr [0.0:1.0] noreverse 33 | set xlabel "Recall" 34 | set ylabel "Precision" 35 | 36 | 37 | # XYData Plotting 38 | plot \ 39 | '-' using 1:2 title "Joint" with linespoints ls 2, \ 40 | '-' using 1:2 title "MF" with linespoints ls 1, \ 41 | '-' using 1:2 title "Mintz09" with linespoints ls 12, \ 42 | '-' using 1:2 title "Yao11" with linespoints ls 4, \ 43 | '-' using 1:2 title "Surdeanu12" with linespoints ls 3, \ 44 | '-' using 1:2 title "Riedel13-F" with linespoints ls 9 45 | # Joint 46 | 0.0 1.0 47 | 0.1 0.9561403508771931 48 | 0.2 0.9175637958532697 49 | 0.3 0.8734179657354529 50 | 0.4 0.847352784814085 51 | 0.5 0.8281385859497583 52 | 0.6 0.6280542866809915 53 | 0.7 0.5553527881495134 54 | 0.8 0.4954569942312183 55 | 0.9 0.35796469257369756 56 | 1.0 0.273855414645719 57 | end 58 | # MF 59 | 0.0 1.0 60 | 0.1 0.9236842105263157 61 | 0.2 0.8796321945213911 62 | 0.3 0.8316317108560876 63 | 0.4 0.8000582913269824 64 | 0.5 0.7826510614100983 65 | 0.6 0.5984916612797895 66 | 0.7 0.5377585005461876 67 | 0.8 0.47144152503664516 68 | 0.9 0.33738750278770263 69 | 1.0 0.25211611853430466 70 | end 71 | # Mintz09 72 | 0.0 1.0 73 | 0.1 0.5680898049319102 74 | 0.2 0.49888269625111725 75 | 0.3 0.39253056884635834 76 | 0.4 0.34616228070175437 77 | 0.5 0.330326960694419 78 | 0.6 0.2625455040315721 79 | 0.7 0.2544227428517771 80 | 0.8 0.20727745289148797 81 | 0.9 0.1111017092570655 82 | 1.0 0.10526315789473684 83 | end 84 | # Yao11 85 | 0.0 1.0 86 | 0.1 0.7298830409356726 87 | 0.2 0.6976507005539263 88 | 0.3 0.6054098105574357 89 | 0.4 0.5561356142314963 90 | 0.5 0.5196400928792569 91 | 0.6 0.3274135882806314 92 | 0.7 0.3046111421649812 93 | 0.8 0.2690176216877453 94 | 0.9 0.12848082810905379 95 | 1.0 0.11842105263157894 96 | end 97 | # Surdeanu12 98 | 0.0 1.0 99 | 0.1 0.7976055002370792 100 | 0.2 0.7603247984826932 101 | 0.3 0.7165779313147734 102 | 0.4 0.692342471423568 103 | 0.5 0.6831544034214891 104 | 0.6 0.5072457004580417 105 | 0.7 0.4886786729963325 106 | 0.8 0.4152355565638234 107 | 0.9 0.30327260458839406 108 | 1.0 0.23157894736842108 109 | end 110 | # Riedel13-F 111 | 0.0 1.0 112 | 0.1 0.8756827967354283 113 | 0.2 0.8394421952316689 114 | 0.3 0.7904526759789917 115 | 0.4 0.7538715117662486 116 | 0.5 0.7354230316280177 117 | 0.6 0.5990688237411651 118 | 0.7 0.5388328295170529 119 | 0.8 0.488869738535952 120 | 0.9 0.3873569895215238 121 | 1.0 0.2823195983625368 122 | end 123 | 124 | unset output 125 | # Wrapup 126 | set terminal dumb 127 | refresh 128 | -------------------------------------------------------------------------------- /data/formulae/filtered.txt: 
-------------------------------------------------------------------------------- 1 | //1 0.97 0.96 27 curated 2 | path#nn|<-nn<-unit->prep->of->pobj->|pobj:INV => REL$/organization/parent/child 3 | 4 | //2 0.97 1.00 22 curated 5 | path#appos|->appos->subsidiary->prep->of->pobj->|pobj:INV => REL$/organization/parent/child 6 | 7 | //3 0.97 0.88 17 curated 8 | path#rcmod|->rcmod->own->prep->by->pobj->|pobj:INV => REL$/organization/parent/child 9 | 10 | //4 0.97 1.00 26 curated 11 | path#nn|<-nn<-city->prep->of->pobj->|pobj:INV => REL$/location/location/containedby 12 | 13 | //5 0.97 1.00 11 curated 14 | path#appos|->appos->subsidiary->nn->|nn:INV => REL$/organization/parent/child 15 | 16 | //6 0.97 0.97 100 curated 17 | path#poss|<-poss<-minister->appos->|appos:INV => REL$/people/person/nationality 18 | 19 | //7 0.97 0.96 50 curated 20 | path#appos|->appos->unit->prep->of->pobj->|pobj:INV => REL$/organization/parent/child 21 | 22 | //8 0.96 0.96 25 curated 23 | path#appos|->appos->division->prep->of->pobj->|pobj:INV => REL$/organization/parent/child 24 | 25 | //9 0.96 1.00 36 curated 26 | path#poss|<-poss<-executive->appos->|appos:INV => REL$/business/person/company 27 | 28 | //10 0.96 1.00 11 curated 29 | path#appos|->appos->co-founder->prep->of->pobj->|pobj:INV => REL$/business/company/founders 30 | 31 | //11 0.96 0.92 59 curated 32 | path#dobj|<-dobj<-review->prep->by->pobj->|pobj:INV => REL$/book/author/works_written 33 | 34 | //12 0.95 0.85 27 curated 35 | path#appos|->appos->founder->prep->of->pobj->|pobj:INV => REL$/business/company/founders 36 | 37 | //13 0.95 0.94 89 curated 38 | path#nn|<-nn<-town->prep->of->pobj->|pobj:INV => REL$/location/location/containedby 39 | 40 | //14 0.95 0.53 40 curated 41 | path#nn|<-nn<-neighborhood->prep->of->pobj->|pobj:INV => REL$/location/neighborhood/neighborhood_of 42 | 43 | //15 0.95 0.82 28 curated 44 | path#appos|->appos->director->dep->|dep:INV => REL$/film/film/directed_by 45 | 46 | //16 0.95 0.92 13 curated 47 | path#poss|<-poss<-region->nn->|nn:INV => REL$/location/location/containedby 48 | 49 | //17 0.94 0.92 13 curated 50 | path#appos|->appos->producer->dep->|dep:INV => REL$/film/film/produced_by 51 | 52 | //18 0.94 0.62 47 curated 53 | path#poss|<-poss<-film->dep->|dep:INV => REL$/film/film/directed_by 54 | 55 | //19 0.94 0.97 29 curated 56 | path#nsubj|<-nsubj<-professor->prep->at->pobj->|pobj:INV => REL$/location/location/containedby 57 | 58 | //20 0.94 0.58 24 curated 59 | path#poss|<-poss<-movie->dep->|dep:INV => REL$/film/film/directed_by 60 | 61 | //21 0.93 0.80 15 curated 62 | path#poss|<-poss<-leader->appos->|appos:INV => REL$/people/person/nationality 63 | 64 | //22 0.93 0.63 16 curated 65 | path#nn|<-nn<-film->dep->|dep:INV => REL$/film/film/directed_by 66 | 67 | //23 0.93 0.85 20 curated 68 | path#nn|<-nn<-suburb->prep->of->pobj->|pobj:INV => REL$/location/location/containedby 69 | 70 | //24 0.93 0.67 39 curated 71 | path#appos|->appos->daughter->prep->of->pobj->|pobj => REL$/people/person/parents 72 | 73 | //25 0.93 0.87 15 curated 74 | path#poss|<-poss<-chairman->appos->|appos:INV => REL$/business/person/company 75 | 76 | //26 0.93 0.92 12 curated 77 | path#nn|<-nn<-side->prep->of->pobj->|pobj:INV => REL$/location/location/containedby 78 | 79 | //27 0.93 0.64 216 curated 80 | path#nsubj|<-nsubj<-die->prep->in->pobj->|pobj => REL$/people/deceased_person/place_of_death 81 | 82 | //28 0.93 0.53 40 curated 83 | path#poss|<-poss<-neighborhood->nn->|nn:INV => REL$/location/neighborhood/neighborhood_of 84 | 85 | //29 0.91 0.93 15 
curated 86 | path#nsubj|<-nsubj<-professor->appos->|appos:INV => REL$/location/location/containedby 87 | 88 | //30 0.91 0.76 33 curated 89 | path#nsubj|<-nsubj<-die->prep->at->pobj->hospital->prep->in->pobj->|pobj => REL$/people/deceased_person/place_of_death 90 | 91 | //31 0.91 0.47 45 curated 92 | path#poss|<-poss<-book->dep->|dep => REL$/book/author/works_written 93 | 94 | //32 0.90 0.87 15 curated 95 | path#nsubj|<-nsubj<-name->dobj->|dobj:INV => REL$/business/person/company 96 | 97 | //33 0.90 0.51 241 curated 98 | path#nsubjpass|<-nsubjpass<-bear->prep->in->pobj->|pobj => REL$/people/person/place_of_birth 99 | 100 | //34 0.90 0.48 153 curated 101 | path#appos|->appos->minister->poss->|poss => REL$/people/person/nationality 102 | 103 | //35 0.88 0.70 83 curated 104 | path#appos|->appos->capital->prep->of->pobj->|pobj => REL$/location/location/containedby 105 | 106 | //36 0.87 0.68 19 curated 107 | path#nsubj|<-nsubj<-city->prep->in->pobj->|pobj => REL$/location/location/containedby -------------------------------------------------------------------------------- /data/naacl2013/structured/eval-naacl13-structured.out.txt: -------------------------------------------------------------------------------- 1 | Reading in annotations... 2 | Collecting facts from rank files 3 | Loading Annotations 4 | Loading Rank Files 5 | Latex: 6 | \begin{center} 7 | \begin{tabular}{ l l | c c c c c c c } 8 | Relation & \# & MI09 & YA11 & SU12 & N & F & NF & NFE \\ 9 | \hline 10 | person/company & 103 & 0.67 & 0.64 & 0.70 & 0.73 & 0.75 & 0.76 & {\bf 0.79} \\ 11 | location/containedby & 74 & 0.48 & 0.51 & 0.54 & 0.43 & 0.68 & 0.67 & {\bf 0.69} \\ 12 | person/nationality & 29 & 0.13 & {\bf 0.39} & 0.12 & 0.14 & 0.18 & 0.18 & 0.21 \\ 13 | author/works\_written & 29 & 0.50 & 0.51 & 0.52 & 0.45 & 0.61 & 0.63 & {\bf 0.69} \\ 14 | parent/child & 19 & 0.14 & 0.25 & 0.62 & 0.46 & 0.76 & {\bf 0.78} & 0.76 \\ 15 | person/place\_of\_death & 19 & 0.79 & 0.79 & 0.86 & {\bf 0.89} & 0.83 & 0.85 & 0.86 \\ 16 | person/place\_of\_birth & 18 & 0.78 & 0.75 & 0.82 & 0.50 & 0.83 & 0.81 & {\bf 0.89} \\ 17 | neighborhood/neighborhood\_of & 12 & 0.00 & 0.00 & 0.08 & 0.43 & 0.65 & 0.66 & {\bf 0.72} \\ 18 | person/parents & 7 & 0.24 & 0.27 & 0.58 & 0.56 & 0.53 & {\bf 0.58} & 0.39 \\ 19 | company/founders & 4 & 0.25 & 0.25 & 0.53 & 0.24 & 0.77 & {\bf 0.80} & 0.68 \\ 20 | film/directed\_by & 4 & 0.06 & 0.15 & 0.25 & 0.09 & 0.26 & 0.26 & {\bf 0.30} \\ 21 | sports\_team/league & 4 & 0.00 & 0.43 & 0.18 & 0.21 & 0.59 & {\bf 0.70} & 0.63 \\ 22 | team/arena\_stadium & 3 & 0.00 & 0.06 & 0.06 & 0.03 & 0.08 & {\bf 0.09} & 0.08 \\ 23 | team\_owner/teams\_owned & 2 & 0.00 & 0.50 & 0.70 & 0.55 & 0.38 & 0.61 & {\bf 0.75} \\ 24 | roadcast/area\_served & 2 & {\em 1.00} & 0.50 & {\em 1.00} & 0.58 & 0.58 & 0.83 & {\em 1.00} \\ 25 | structure/architect & 2 & 0.00 & 0.00 & {\em 1.00} & 0.27 & {\em 1.00} & {\em 1.00} & {\em 1.00} \\ 26 | composer/compositions & 2 & 0.00 & 0.00 & 0.00 & 0.50 & 0.67 & {\bf 0.83} & 0.12 \\ 27 | person/religion & 1 & 0.00 & {\em 1.00} & {\em 1.00} & 0.50 & {\em 1.00} & {\em 1.00} & {\em 1.00} \\ 28 | film/produced\_by & 1 & {\em 1.00} & {\em 1.00} & {\em 1.00} & {\em 1.00} & 0.50 & 0.50 & 0.33 \\ 29 | \hline 30 | MAP & & 0.32 & 0.42 & 0.56 & 0.45 & 0.61 & 0.66 & 0.63 \\ 31 | Weighted MAP & & 0.48 & 0.52 & 0.57 & 0.52 & 0.66 & 0.67 & 0.69 \\ 32 | \end{tabular} 33 | \end{center} 34 | Summary: 35 | Pattern Gold+ Gold+- | MAP Missing | MAP Missing | MAP Missing | MAP Missing | MAP Missing | MAP Missing | MAP Missing 36 | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 37 | person/company$ 103 166 | 0.67 0 | 0.64 0 | 0.70 0 | 0.73 0 | 0.75 0 | 0.76 0 | 0.79 0 38 | location/containedby$ 74 235 | 0.48 0 | 0.51 0 | 0.54 0 | 0.43 0 | 0.68 0 | 0.67 0 | 0.69 0 39 | person/nationality$ 29 278 | 0.13 0 | 0.39 0 | 0.12 0 | 0.14 0 | 0.18 0 | 0.18 0 | 0.21 0 40 | author/works_written$ 29 280 | 0.50 0 | 0.51 0 | 0.52 0 | 0.45 0 | 0.61 0 | 0.63 0 | 0.69 0 41 | parent/child$ 19 256 | 0.14 0 | 0.25 0 | 0.62 0 | 0.46 0 | 0.76 0 | 0.78 0 | 0.76 0 42 | person/place_of_death$ 19 221 | 0.79 0 | 0.79 0 | 0.86 0 | 0.89 0 | 0.83 0 | 0.85 0 | 0.86 0 43 | person/place_of_birth$ 18 216 | 0.78 0 | 0.75 0 | 0.82 0 | 0.50 0 | 0.83 0 | 0.81 0 | 0.89 0 44 | neighborhood/neighborhood_of 12 245 | 0.00 -1 | 0.00 -1 | 0.08 0 | 0.43 0 | 0.65 0 | 0.66 0 | 0.72 0 45 | person/parents$ 7 279 | 0.24 0 | 0.27 0 | 0.58 0 | 0.56 0 | 0.53 0 | 0.58 0 | 0.39 0 46 | company/founders$ 4 294 | 0.25 0 | 0.25 0 | 0.53 0 | 0.24 0 | 0.77 0 | 0.80 0 | 0.68 0 47 | film/directed_by 4 305 | 0.06 0 | 0.15 0 | 0.25 0 | 0.09 0 | 0.26 0 | 0.26 0 | 0.30 0 48 | sports_team/league$ 4 293 | 0.00 -1 | 0.43 0 | 0.18 0 | 0.21 0 | 0.59 0 | 0.70 0 | 0.63 0 49 | team/arena_stadium$ 3 220 | 0.00 0 | 0.06 0 | 0.06 0 | 0.03 0 | 0.08 0 | 0.09 0 | 0.08 0 50 | team_owner/teams_owned$ 2 229 | 0.00 0 | 0.50 0 | 0.70 0 | 0.55 0 | 0.38 0 | 0.61 0 | 0.75 0 51 | roadcast/area_served$ 2 297 | 1.00 0 | 0.50 0 | 1.00 0 | 0.58 0 | 0.58 0 | 0.83 0 | 1.00 0 52 | structure/architect$ 2 286 | 0.00 -1 | 0.00 -1 | 1.00 0 | 0.27 0 | 1.00 0 | 1.00 0 | 1.00 0 53 | composer/compositions$ 2 297 | 0.00 -1 | 0.00 -1 | 0.00 0 | 0.50 0 | 0.67 0 | 0.83 0 | 0.12 0 54 | person/religion$ 1 271 | 0.00 -1 | 1.00 0 | 1.00 0 | 0.50 0 | 1.00 0 | 1.00 0 | 1.00 0 55 | film/produced_by$ 1 291 | 1.00 0 | 1.00 0 | 1.00 0 | 1.00 0 | 0.50 0 | 0.50 0 | 0.33 0 56 | Average 0 0 | 0.32 -1 | 0.42 -1 | 0.56 -1 | 0.45 -1 | 0.61 -1 | 0.66 -1 | 0.63 -1 57 | Global 0 0 | 0.48 -1 | 0.52 -1 | 0.57 -1 | 0.52 -1 | 0.66 -1 | 0.67 -1 | 0.69 -1 58 | name MI09 YA11 SU12 N F NF NFE 59 | MI09 3/10 0.092 1/15 0.001 5/13 0.096 2/17 0.001 2/17 0.001 1/17 0.000 60 | MI09 3/10 0.092 1/15 0.001 5/13 0.096 2/17 0.001 2/17 0.001 1/17 0.000 61 | MI09 3/10 0.092 1/15 0.001 5/13 0.096 2/17 0.001 2/17 0.001 1/17 0.000 62 | MI09 3/10 0.092 1/15 0.001 5/13 0.096 2/17 0.001 2/17 0.001 1/17 0.000 63 | MI09 3/10 0.092 1/15 0.001 5/13 0.096 2/17 0.001 2/17 0.001 1/17 0.000 64 | MI09 3/10 0.092 1/15 0.001 5/13 0.096 2/17 0.001 2/17 0.001 1/17 0.000 65 | YA11 2/13 0.007 9/ 9 1.185 3/15 0.008 2/16 0.001 2/16 0.001 66 | YA11 2/13 0.007 9/ 9 1.185 3/15 0.008 2/16 0.001 2/16 0.001 67 | YA11 2/13 0.007 9/ 9 1.185 3/15 0.008 2/16 0.001 2/16 0.001 68 | YA11 2/13 0.007 9/ 9 1.185 3/15 0.008 2/16 0.001 2/16 0.001 69 | YA11 2/13 0.007 9/ 9 1.185 3/15 0.008 2/16 0.001 2/16 0.001 70 | SU12 12/ 6 0.238 5/12 0.143 5/12 0.143 2/14 0.004 71 | SU12 12/ 6 0.238 5/12 0.143 5/12 0.143 2/14 0.004 72 | SU12 12/ 6 0.238 5/12 0.143 5/12 0.143 2/14 0.004 73 | SU12 12/ 6 0.238 5/12 0.143 5/12 0.143 2/14 0.004 74 | N 4/14 0.031 2/17 0.001 4/15 0.019 75 | N 4/14 0.031 2/17 0.001 4/15 0.019 76 | N 4/14 0.031 2/17 0.001 4/15 0.019 77 | F 3/13 0.021 4/12 0.077 78 | F 3/13 0.021 4/12 0.077 79 | NF 7/10 0.629 80 | -------------------------------------------------------------------------------- /overview.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/uclnlp/low-rank-logic/2c5686eda9e0c0c389ede5c6e4eea885d14e947c/overview.png -------------------------------------------------------------------------------- /project/Build.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import Keys._ 3 | 4 | object Build extends Build { 5 | val Organization = "uclmr" 6 | val Name = "low-rank-logic" 7 | val Version = "0.1.0-SNAPSHOT" 8 | val ScalaVersion = "2.10.4" 9 | 10 | 11 | lazy val wolfeCore = ProjectRef(file("./wolfe"), "wolfe-core") 12 | lazy val wolfeUtil = ProjectRef(file("./wolfe"), "wolfe-util") 13 | 14 | lazy val root = Project( 15 | "low-rank-logic", 16 | file("."), 17 | settings = Defaults.defaultSettings ++ Seq( 18 | organization := Organization, 19 | name := Name, 20 | version := Version, 21 | scalaVersion := ScalaVersion, 22 | libraryDependencies ++= Seq( 23 | "net.sandrogrzicic" %% "scalabuff-compiler" % "1.3.6", 24 | "net.sandrogrzicic" %% "scalabuff-runtime" % "1.3.6", 25 | "com.google.protobuf" % "protobuf-java" % "2.3.0" 26 | ), 27 | commands ++= Seq(vmargs), 28 | fork in run := true //use a fresh JVM for sbt run 29 | ) 30 | ) dependsOn ( 31 | wolfeCore % "test->test;compile->compile", 32 | wolfeUtil % "test->test;compile->compile"//, 33 | ) 34 | 35 | //utility methods 36 | def vmargs = Command.args("vmargs", "") { 37 | (state, args) => 38 | val javaRunOptions = args.mkString(" ") 39 | println("Applying JVM arguments: " + javaRunOptions) 40 | Project.extract(state).append(javaOptions := Seq(javaRunOptions), state) 41 | } 42 | } -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.mojolly.scalate" % "xsbt-scalate-generator" % "0.5.0") 2 | 3 | addSbtPlugin("org.scalatra.sbt" % "scalatra-sbt" % "0.3.5") -------------------------------------------------------------------------------- /src/main/scala/uclmr/AnnotationTool.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import java.io.{FileInputStream, File, PrintStream, InputStream} 4 | import java.text.SimpleDateFormat 5 | import java.util.Calendar 6 | 7 | import scala.collection.mutable 8 | import scala.collection.mutable.{HashSet, HashMap} 9 | import scala.io.Source 10 | 11 | /** 12 | * @author Sebastian Riedel 13 | */ 14 | object AnnotationTool { 15 | 16 | import ml.wolfe.util.ANSIFormatter.ANSIString 17 | 18 | case class Annotation(tuple: Seq[Any], label: String, correct: Boolean) { 19 | override def toString = (Seq(if (correct) "1" else "0", label) ++ tuple).mkString("\t") 20 | 21 | def fact = tuple -> label 22 | } 23 | 24 | def loadAnnotations(in: InputStream, out: Option[PrintStream] = None) = { 25 | println("Reading in annotations...") 26 | val result = new mutable.HashMap[(Seq[Any], String), Annotation]() 27 | for (line <- Source.fromInputStream(in).getLines()) { 28 | val fields = line.split("\\t") 29 | val correct = fields(0) == "1" 30 | val label = fields(1) 31 | val tuple = fields.drop(2).toSeq 32 | result(Tuple2(tuple,label)) = Annotation(tuple, label, correct) 33 | for (o <- out) o.println(line) 34 | } 35 | result 36 | } 37 | 38 | def loadMentions(mentionFileName: String) = { 39 | val pair2sen = new HashMap[Seq[Any], HashSet[String]] // arg1 -> rel arg1 arg2 40 | val source = 
Source.fromFile(mentionFileName,"ISO-8859-1") 41 | println("Loading mention file...") 42 | for (line <- source.getLines(); if (!line.startsWith("#Document"))) { 43 | val fields = line.split("\t") 44 | val sen = fields(fields.length - 1) 45 | val sens = pair2sen.getOrElseUpdate(Seq(fields(1), fields(2)), new HashSet[String]) 46 | sens += sen 47 | } 48 | source.close() 49 | pair2sen 50 | } 51 | 52 | 53 | def main(args: Array[String]) { 54 | val sourceName = args(0) 55 | val projDirName = args(1) 56 | val mentionFileName = args(2) 57 | //val pattern = args.lift(3).getOrElse("").r 58 | val pattern = if (args.length > 3) args(3) else ".*" 59 | val previousFileName = args.lift(4).getOrElse("latest.tsv") 60 | println(previousFileName) 61 | val newFileName = args.lift(5).getOrElse({ 62 | val cal = Calendar.getInstance() 63 | val sdf = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss") 64 | sdf.format(cal.getTime) + ".tsv" 65 | }) 66 | val projDir = new File(projDirName) 67 | projDir.mkdirs() 68 | 69 | val sourceFile = new File(sourceName) 70 | val previousFile = new File(projDir, previousFileName) 71 | val newFile = new File(projDir, newFileName) 72 | val out = new PrintStream(newFile) 73 | 74 | //read in mention file 75 | 76 | val pair2sen = loadMentions(mentionFileName) 77 | 78 | //read in previous file if exists 79 | //Format: Tuple, System, 80 | val annotations = if (previousFile.exists()) 81 | loadAnnotations(new FileInputStream(previousFile), Some(out)) 82 | else 83 | new mutable.HashMap[(Seq[Any], String), Annotation] 84 | println("Previous Annotations: " + annotations.size) 85 | 86 | //set up new softlink 87 | setupSoftlink(new File(projDir, "latest.tsv"), newFile) 88 | 89 | var labelled = 0 90 | 91 | //go through ranked file, and find tuples not yet annotated 92 | for (line <- Source.fromFile(sourceFile).getLines()) { 93 | val lineTmp = line.split("\\|").mkString("\t") 94 | val Array(score, arg1, arg2, freebase, predicted) = lineTmp.split("\\t") 95 | //if (pattern.contains(predicted)) { 96 | //if (predicted.contains(pattern)) { 97 | if (predicted matches pattern) { 98 | val tuple = Seq(arg1, arg2) 99 | annotations.get(Tuple2(tuple, predicted)) match { 100 | case None => 101 | //get sentences 102 | val sentences = pair2sen.getOrElse(tuple, Set.empty) 103 | //ask user 104 | println("*************************************************") 105 | println("Asking for annotation of: " + tuple.mkString(" | ")) 106 | println("Number of annotations: " + labelled) 107 | println("Prediction: " + predicted) 108 | println("Score: " + score) 109 | println("Freebase: " + freebase) 110 | println("Sentences: ") 111 | for (sentence <- sentences) { 112 | var current:String = sentence 113 | var first = true 114 | for (arg <- tuple) { 115 | def render = if (first) arg.toUpperCase.onBlue() else arg.toUpperCase.onRed() 116 | if (current.contains(arg)) { 117 | current = current.replaceAll(arg, if (first) arg.onBlue() else arg.onRed()) 118 | } else if (current.contains(arg.toUpperCase)) { 119 | current = current.replaceAll(arg.toUpperCase, render) 120 | } else if (current.contains(arg.toLowerCase)) { 121 | current = current.replaceAll(arg.toLowerCase, render) 122 | } 123 | first = false 124 | } 125 | println(" " + current) 126 | } 127 | println("Correct (y/N)?: ") 128 | val line = readLine() 129 | val correct = line.trim.toLowerCase == "y" 130 | val annotation = Annotation(tuple, predicted, correct) 131 | out.println(annotation) 132 | out.flush() 133 | 134 | case Some(annotation) => //println(annotation) 135 | } 136 | 
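// running count shown as "Number of annotations" in the prompt above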
labelled += 1 137 | } 138 | 139 | } 140 | 141 | 142 | } 143 | 144 | def setupSoftlink(latest: File, newFile: File) { 145 | if (latest.exists()) { 146 | //remove latest file, assuming it's a softlink 147 | latest.delete() 148 | } 149 | import scala.sys.process._ 150 | 151 | ("/bin/ln -s " + newFile.getAbsolutePath + " " + latest.getAbsolutePath).!! 152 | 153 | //Runtime.getRuntime.exec("/bin/ln -s %s %s".format(newFile.getAbsolutePath, latest.getAbsolutePath)) 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/EmbeddedProbLogicEvaluation.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import java.io.File 4 | 5 | import cc.factorie.la.DenseTensor1 6 | import com.typesafe.config.ConfigFactory 7 | import ml.wolfe.Wolfe._ 8 | import uclmr.FactorizationUtil.Row 9 | 10 | import scala.collection.mutable 11 | import scala.util.Random 12 | 13 | /** 14 | * @author Sebastian Riedel 15 | */ 16 | object EmbeddedProbLogicEvaluation { 17 | 18 | 19 | def main(args: Array[String]) { 20 | implicit val conf = ConfigFactory.parseFile(new File("conf/epl.conf")) 21 | implicit val random = new Random(0) 22 | assert(!conf.entrySet().isEmpty, "Couldn't find configuration file.") 23 | 24 | def relationFilter(rel: String) = rel.startsWith("path") || (rel.startsWith("REL$") && rel != "REL$NA") 25 | 26 | //load raw data 27 | val trainRaw = FactorizationUtil.loadLiminFile(new File(conf.getString("epl.train")), relationFilter) 28 | val train = FactorizationUtil.filterRows(random.shuffle(trainRaw.toBuffer), conf.getInt("epl.min-rows"), conf.getInt("epl.min-cols")) 29 | 30 | val unlabeledRaw = FactorizationUtil.loadLiminFile(new File(conf.getString("epl.unlabeled")), relationFilter, skipUnlabeled = true) 31 | val unlabeled = FactorizationUtil.filterRows(unlabeledRaw.toSeq, conf.getInt("epl.min-rows"), conf.getInt("epl.min-cols"), !_.startsWith("REL$")) 32 | val combined = if (conf.getBoolean("epl.use-unlabeled")) train ++ unlabeled else train 33 | 34 | 35 | //relations 36 | val trainRelations = combined.flatMap(_.relations.map(_._1)).distinct.sorted // REL$/book/book_edition/author_editor 37 | val freebaseRelations = trainRelations.filter(_.startsWith("REL$")) //Seq("REL$/business/person/company")// 38 | val surfacePatterns = trainRelations.filterNot(_.startsWith("REL$")).toSet 39 | 40 | val testRaw = FactorizationUtil.loadLiminFile(new File(conf.getString("epl.test")), relationFilter, 41 | skipUnlabeled = true, minObsCount = 1).toSeq 42 | val test = FactorizationUtil.filterRows(testRaw, 1, 1, surfacePatterns) 43 | 44 | println(trainRelations.size) 45 | println(train.size) 46 | println(unlabeled.size) 47 | println(test.size) 48 | 49 | val priorRepulsion = conf.getDouble("epl.prior-repulsion") 50 | val priorCounts = Map((true, false) -> priorRepulsion, (false, true) -> priorRepulsion) withDefaultValue 0.0 51 | 52 | println("Extracting Binary rules") 53 | val trainRulesRaw = 54 | if (conf.getBoolean("epl.combine-datasets")) RuleLearner.learn(combined, priorCounts) 55 | else RuleLearner.learn(train, priorCounts) + RuleLearner.learn(unlabeled, priorCounts) 56 | 57 | val cooccurMin = conf.getDouble("epl.min-cooccur") 58 | 59 | println("Finding components") 60 | val connected = RuleFilters.connectedComponents(trainRulesRaw, cooccurMin) 61 | val connectedFreebase = connected.filter(c => freebaseRelations.exists(c._1.nodes)) 62 | println(s"Connected Components: ${ connected.size }") 63 | 
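// components without any REL$ relation are dropped above, since they cannot yield Freebase predictions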
println(s"Connected Components with freebase relations: ${ connectedFreebase.size }") 64 | println(s"Total count of rules in components: ${ connectedFreebase.view.map(_._2.rules2.size).sum }") 65 | FactorizationUtil.saveToFile( 66 | connected.map(_._1.nodes.toList.sorted.mkString("------\n ", "\n ", "")), 67 | new File("/tmp/components.txt")) 68 | 69 | 70 | val trainRulesBeforeSubsampling = Rules(connectedFreebase.map(_._2.rules2).reduce(_ ++ _)) //RuleFilters.keep2ndOrder(joinedRulesRaw, cooccurMin) 71 | val subsample = conf.getDouble("epl.subsample") 72 | val trainRules = Rules(trainRulesBeforeSubsampling.rules2.filter(p => p._2.cooccurCount >= 1 || random.nextDouble() < subsample)) 73 | //val trainRulesFiltered = trainRules.copy(rules2 = trainRules.) 74 | 75 | println(s"Original rule count: ${ trainRulesRaw.rules2.size }") 76 | println(s"Filtered rule count: ${ trainRules.rules2.size }") 77 | 78 | 79 | FactorizationUtil.saveToFile(trainRules.rules1.values.toSeq.sortBy(_.rel).mkString("\n"), new File("/tmp/rules1.txt")) 80 | FactorizationUtil.saveToFile(trainRules.rules2.values.toSeq.sortBy(-_.probs(true, true)), new File("/tmp/rules2.txt")) 81 | 82 | 83 | println(s"Embedding ${ trainRules.rules2.size } rules") 84 | val ple = ProbLogicEmbedder.embed(trainRules) 85 | 86 | println("Prediction") 87 | val predictedRows = test map (row => ple.predictRow(row, freebaseRelations)) 88 | val predictedFacts = FactorizationUtil.toRankedFacts(test zip predictedRows).filter(_.score > 0.0) 89 | 90 | println(predictedFacts.take(100).mkString("\n")) 91 | FactorizationUtil.saveForUSchemaEval(predictedFacts, new File("/tmp/ple.txt")) 92 | FactorizationUtil.saveToFile(predictedFacts.mkString("\n"), new File("/tmp/ranked.txt")) 93 | 94 | if (conf.getBoolean("epl.print-comparisons")) { 95 | println("Extracting learned rules") 96 | val learnedRules = ple.pairwiseRules(trainRules.rules2.keys) 97 | compareRules(learnedRules, trainRules.rules2) 98 | } 99 | 100 | } 101 | 102 | def compareRules(rules2: Map[(String, String), Rule2], rules1: Map[(String, String), Rule2]) = { 103 | val paired = for (r1 <- rules1.values; r2 <- rules2.get(r1.rel1 -> r1.rel2)) yield (r1, r2, r2.prob1given2Inc(r1)) 104 | val printPaired = paired.toSeq.sortBy(-_._3).view.map(t => s"Mismatch: ${ t._3 }\n${ t._1 }\n${ t._2 }") 105 | val printPairedInv = paired.toSeq.sortBy(_._3).view.map(t => s"Mismatch: ${ t._3 }\n${ t._1 }\n${ t._2 }") 106 | FactorizationUtil.saveToFile(printPaired, new File("/tmp/rule-comparisons.txt")) 107 | FactorizationUtil.saveToFile(printPairedInv, new File("/tmp/rule-comparisons-inv.txt")) 108 | 109 | val avgCondMismatch = paired.view.map(t => math.abs(t._3)).sum / paired.size 110 | println("Average cond. 
mismatch: " + avgCondMismatch) 111 | 112 | } 113 | 114 | } 115 | 116 | object EmbeddedProbLogicPlayground { 117 | 118 | import scala.math._ 119 | 120 | def manualRules(): Unit = { 121 | implicit val conf = ConfigFactory.parseFile(new File("conf/epl-synth.conf")) withFallback ConfigFactory.parseFile(new File("conf/epl.conf")) 122 | implicit val random = new Random(0) 123 | 124 | 125 | val test = FactorizationUtil.sampleRows(10, 10, 0.2) 126 | val manualData = Seq( 127 | Row("e1", "e2", Seq("r1", "r2").map(_ -> 1.0)), 128 | Row("e3", "e4", Seq("r2", "r3").map(_ -> 1.0)), 129 | Row("e5", "e6", Seq("r4", "r5").map(_ -> 1.0)) 130 | ) 131 | 132 | val dataRelations = test.flatMap(_.observedTrue).distinct.sorted 133 | 134 | 135 | val manualEmbeddings = ProbLogicEmbeddings(Map( 136 | "r0" -> PredicateEmbedding("r0", new DenseTensor1(Array(1.0, 0.0)), 1.0, -2.0, 10.0), 137 | "r1" -> PredicateEmbedding("r1", new DenseTensor1(Array(1.0 / sqrt(2), 1.0 / sqrt(2))), 1.0, 0.0, 1.0) 138 | )) 139 | val ple = manualEmbeddings //ProbLogicEmbedder.embed(manualRules).copy(average = false) //ProbLogicEmbedder.embed(randomRules) 140 | 141 | val rulesData = RuleLearner.learn(manualData) 142 | 143 | val pleData = ProbLogicEmbedder.embed(rulesData) 144 | val learnedRules = pleData.pairwiseRules(rulesData.rules2.keys) 145 | 146 | EmbeddedProbLogicEvaluation.compareRules(learnedRules, rulesData.rules2) 147 | 148 | val predictionsData = for (row <- test) yield { 149 | row.copy(relations = dataRelations.map(r => r -> pleData.predict(row.observedTrue, r))) 150 | } 151 | 152 | println(FactorizationUtil.renderPredictions(predictionsData, test)) 153 | println(pleData.embeddings.values.mkString("\n")) 154 | 155 | } 156 | 157 | def main(args: Array[String]) { 158 | manualRules() 159 | } 160 | 161 | def rulesFromRandomData() { 162 | implicit val conf = ConfigFactory.parseFile(new File("conf/epl-synth.conf")) withFallback ConfigFactory.parseFile(new File("conf/epl.conf")) 163 | implicit val random = new Random(0) 164 | 165 | val randomRows = FactorizationUtil.sampleRows(10, 4, 0.2) 166 | val randomRelations = randomRows.flatMap(_.relations.map(_._1)).distinct.sorted 167 | val randomRules = RuleLearner.learn(randomRows) 168 | 169 | val ple = ProbLogicEmbedder.embed(randomRules) 170 | 171 | val predictions = for (row <- randomRows) yield { 172 | row.copy(relations = randomRelations.map(r => r -> ple.predict(row.observedTrue, r))) 173 | } 174 | 175 | println(randomRules) 176 | 177 | println(FactorizationUtil.renderPredictions(predictions, randomRows)) 178 | 179 | } 180 | } 181 | 182 | 183 | object RuleFilters { 184 | 185 | import scala.math._ 186 | 187 | def keep2ndOrder(rules: Rules, 188 | minCooccurCount: Double) = { 189 | val filtered = rules.rules2.filter(_._2.cooccurCount >= minCooccurCount - 0.0001).map(p => p._1 -> p._2.count * p._2.probs(true, true)) 190 | val graph = filtered ++ filtered.map(p => p.copy(_1 = p._1.swap)) withDefaultValue 0.0 191 | val arg1ToEdges = filtered.toList.groupBy(_._1._1) withDefaultValue Nil 192 | val arg2ToEdges = filtered.toList.groupBy(_._1._2) withDefaultValue Nil 193 | 194 | def expand(graph: Map[(String, String), Double]) = { 195 | //go over all edges (e1,e2) and connect e1 to e3 for each (e2,e3) 196 | //todo: this doesn't find the highest scoring path though 197 | val newEdges = for (((arg1, arg2), s1) <- graph; 198 | ((_, arg3), s2) <- arg1ToEdges(arg2) 199 | if arg3 != arg1 && !graph.contains((arg1, arg3))) yield (arg1, arg3) -> min(s1, s2) 200 | graph ++ newEdges 201 | } 202 | val 
expanded = expand(graph)
203 | rules.copy(rules2 = rules.rules2.filterKeys(expanded.contains))
204 | }
205 |
206 | class Component(first: String) {
207 | val edges = new mutable.HashSet[(String, String)]
208 | val nodes = new mutable.HashSet[String]
209 | nodes += first
210 | }
211 |
212 | def connectedComponents(rules: Rules, minCooccurCount: Double, filter: String => Boolean = _ => true) = {
213 | val filtered = rules.rules2.filter(_._2.cooccurCount >= minCooccurCount - 0.0001).map(p => p._1 -> p._2.count * p._2.probs(true, true))
214 | val graph = filtered ++ filtered.map(p => p.copy(_1 = p._1.swap)) withDefaultValue 0.0
215 |
216 | val components = new mutable.HashMap[String, Component]()
217 | for ((a1, a2) <- graph.keys) {
218 | val c1 = components.getOrElseUpdate(a1, new Component(a1))
219 | val c2 = components.getOrElseUpdate(a2, new Component(a2))
220 | if (c1 == c2) {
221 | c1.edges += ((a1, a2))
222 | } else {
223 | val (keep, discard) = if (c1.nodes.size > c2.nodes.size) (c1, c2) else (c2, c1)
224 | keep.edges += ((a1, a2))
225 | keep.edges ++= discard.edges
226 | keep.nodes ++= discard.nodes
227 | for (n <- discard.nodes) components(n) = keep
228 | }
229 | }
230 | val filteredComponents = components.values.toList.distinct.view.filter(c => c.nodes.exists(filter))
231 | filteredComponents.map(c => c -> rules.copy(rules2 = rules.rules2.filterKeys(e => c.nodes(e._1) && c.nodes(e._2)))).toList
232 |
233 | }
234 |
235 |
236 | }
237 |
238 |
239 |
240 |
241 |
-------------------------------------------------------------------------------- /src/main/scala/uclmr/EntityAwareEvaluation.scala: --------------------------------------------------------------------------------
1 | package uclmr
2 |
3 | import java.io.File
4 |
5 | import com.typesafe.config.ConfigFactory
6 | import ml.wolfe.Wolfe._
7 | import uclmr.FactorizationUtil.Row
8 |
9 | import scala.collection.mutable
10 | import scala.util.Random
11 |
12 | /**
13 | * @author Sebastian Riedel
14 | */
15 | object EntityAwareEvaluation {
16 |
17 | case class Entity(entity: Any, counts: Map[String, Double]) {
18 | override def toString = {
19 | val sorted = counts.toSeq.sortBy(-_._2).map(p => p._1 + " " + p._2)
20 | s"""
21 | |-----
22 | |$entity
23 | |${ sorted.mkString(" ", "\n ", "") }
24 | """.stripMargin
25 | }
26 |
27 | def asArg1 = counts.keys.map(_ + "#1")
28 | def asArg2 = counts.keys.map(_ + "#2")
29 |
30 |
31 | }
32 |
33 | def unaryToBinary(unary: String) = unary.substring(3, unary.length - 2)
34 |
35 | def entitiesFromRows(rows: Seq[Row]) = {
36 | val result = new mutable.HashMap[Any, mutable.HashMap[String, Double]]
37 | for (row <- rows) {
38 | val arg1Counts = result.getOrElseUpdate(row.arg1, new mutable.HashMap[String, Double]())
39 | val arg2Counts = result.getOrElseUpdate(row.arg2, new mutable.HashMap[String, Double]())
40 |
41 | for ((rel, value) <- row.relations) {
42 | val a1 = "A1#" + rel
43 | val a2 = "A2#" + rel
44 | arg1Counts(a1) = arg1Counts.getOrElse(a1, 0.0) + value
45 | arg2Counts(a2) = arg2Counts.getOrElse(a2, 0.0) + value
46 | }
47 | }
48 | result.map(p => p._1 -> Entity(p._1, p._2.toMap)).toMap
49 | }
50 |
51 | def joinRules(rules: Seq[Rules]) = {
52 |
53 | val result = new mutable.HashMap[(String, String), Rule2]
54 | val singleCounts = new mutable.HashMap[String, Double] withDefaultValue 0.0
55 | for (ruleMap <- rules.view.map(_.rules2)) {
56 | for (((r1, r2), rule) <- ruleMap) {
57 | result.get((r1, r2)) match {
58 | case Some(oldRule) =>
59 | result((r1, r2)) = oldRule + rule
60 | case None =>
61 |
result((r1, r2)) = rule //todo: we should use updated single counts if seen in previous rule maps 62 | } 63 | } 64 | } 65 | Rules(result.toMap) 66 | } 67 | 68 | import EmbeddedProbLogicEvaluation._ 69 | 70 | def main(args: Array[String]) { 71 | implicit val conf = ConfigFactory.parseFile(new File("conf/epl-ent.conf")) 72 | implicit val random = new Random(0) 73 | assert(!conf.entrySet().isEmpty, "Couldn't find configuration file.") 74 | 75 | val subsample = conf.getDouble("epl.subsample") 76 | val priorRepulsion = conf.getDouble("epl.prior-repulsion") 77 | val cooccurMin = conf.getDouble("epl.min-cooccur") 78 | 79 | 80 | 81 | def relationFilter(rel: String) = rel.startsWith("path") || (rel.startsWith("REL$") && rel != "REL$NA") 82 | 83 | //load raw data 84 | val trainRaw = FactorizationUtil.loadLiminFile(new File(conf.getString("epl.train")), relationFilter) 85 | val train = FactorizationUtil.filterRows(random.shuffle(trainRaw.toBuffer), conf.getInt("epl.min-rows"), conf.getInt("epl.min-cols")) 86 | 87 | val unlabeledRaw = FactorizationUtil.loadLiminFile(new File(conf.getString("epl.unlabeled")), relationFilter, skipUnlabeled = true) 88 | val unlabeled = FactorizationUtil.filterRows(unlabeledRaw.toSeq, conf.getInt("epl.min-rows"), conf.getInt("epl.min-cols"), !_.startsWith("REL$")) 89 | val combined = if (conf.getBoolean("epl.use-unlabeled")) train ++ unlabeled else train 90 | 91 | 92 | //relations 93 | val trainRelations = combined.flatMap(_.relations.map(_._1)).distinct.sorted // REL$/book/book_edition/author_editor 94 | val freebaseRelations = trainRelations.filter(_.startsWith("REL$")) //Seq("REL$/business/person/company")// 95 | val surfacePatterns = trainRelations.filterNot(_.startsWith("REL$")).toSet 96 | 97 | val testRaw = FactorizationUtil.loadLiminFile(new File(conf.getString("epl.test")), relationFilter, 98 | skipUnlabeled = true, minObsCount = 1).toSeq 99 | val test = FactorizationUtil.filterRows(testRaw, 1, 1, surfacePatterns) 100 | 101 | println(trainRelations.size) 102 | println(train.size) 103 | println(unlabeled.size) 104 | println(test.size) 105 | 106 | println("Extracting entities") 107 | val entities = entitiesFromRows(train ++ unlabeled) 108 | FactorizationUtil.saveToFile(entities.values.toSeq.sortBy(_.entity.toString), new File("/tmp/entities.txt")) 109 | //val filteredEntities = entities.mapValues(e => e.copy(counts = e.counts.toSeq.sortBy(-_._2).take(5).toMap)) 110 | val filteredEntities = Map() ++ entities.mapValues(e => e.copy(counts = random.shuffle(e.counts.toSeq).take(20).toMap)) 111 | FactorizationUtil.saveToFile(filteredEntities.values.toSeq.sortBy(_.entity.toString), new File("/tmp/filtered-entities.txt")) 112 | val testEntities = entitiesFromRows(test) 113 | FactorizationUtil.saveToFile(testEntities.values.toSeq.sortBy(_.entity.toString), new File("/tmp/test-entities.txt")) 114 | 115 | 116 | 117 | val priorCounts = Map((true, false) -> priorRepulsion, (false, true) -> priorRepulsion) withDefaultValue 0.0 118 | 119 | println("Extracting Binary rules") 120 | val trainRulesRaw = 121 | if (conf.getBoolean("epl.combine-datasets")) RuleLearner.learn(combined, priorCounts) 122 | else RuleLearner.learn(train, priorCounts) + RuleLearner.learn(unlabeled, priorCounts) 123 | 124 | println("Extracting Unary rules") 125 | val rulesUnary = EntityRuleLearner.extractUnaryRules(filteredEntities, subSample = 0.01) 126 | FactorizationUtil.saveToFile(rulesUnary.rules2.values.toArray.sortBy(-_.probs(true, true)), new File("/tmp/unary.txt")) 127 | println("Extracting Unary-Binary 
rules") 128 | //val rulesUnary2BinaryTrain = EntityRuleLearner.extractRel2UnaryRules(filteredEntities, train, subSample = 0.01) 129 | val rulesUnary2BinaryCombined = EntityRuleLearner.extractRel2UnaryRules(filteredEntities, train ++ unlabeled, subSample = 0.01) 130 | 131 | //println(rulesUnary2BinaryTrain.rules2.get("A1#path#nsubj|<-nsubj<-have->dobj->|dobj:INV#2" -> "path#nn|<-nn<-station->prep->in->pobj->|pobj")) 132 | //val rulesUnary2BinaryUnlabeled = EntityRuleLearner.extractRel2UnaryRules(filteredEntities, unlabeled, subSample = 0.01) 133 | //println(rulesUnary2BinaryUnlabeled.rules2.get("A1#path#nsubj|<-nsubj<-have->dobj->|dobj:INV#2" -> "path#nn|<-nn<-station->prep->in->pobj->|pobj")) 134 | // val joined = joinRules(Seq(trainRulesRaw,rulesUnary,rulesUnary2BinaryTrain,rulesUnary2BinaryUnlabeled))//trainRulesRaw + rulesUnary + rulesUnary2BinaryTrain + rulesUnary2BinaryUnlabeled 135 | val joined = joinRules(Seq(trainRulesRaw, rulesUnary, rulesUnary2BinaryCombined)) //trainRulesRaw + rulesUnary + rulesUnary2BinaryTrain + rulesUnary2BinaryUnlabeled 136 | println("unary+binary: " + joined.rules2.size) 137 | FactorizationUtil.saveToFile(joined.rules2.values.toSeq.sortBy(-_.cond1given2), new File("/tmp/unary-binary.txt")) 138 | println(joined.rules2.get("A1#path#nsubj|<-nsubj<-have->dobj->|dobj:INV#2" -> "path#nn|<-nn<-station->prep->in->pobj->|pobj")) 139 | 140 | 141 | val trainRules = Rules(joined.rules2.filter(p => p._2.cooccurCount >= 1 || random.nextDouble() < subsample)) 142 | //val trainRulesFiltered = trainRules.copy(rules2 = trainRules.) 143 | 144 | println(s"Original rule count: ${ joined.rules2.size }") 145 | println(s"Filtered rule count: ${ trainRules.rules2.size }") 146 | 147 | 148 | FactorizationUtil.saveToFile(trainRules.rules2.values.toSeq.sortBy(-_.cond1given2), new File("/tmp/ent-rules2.txt")) 149 | 150 | println(s"Embedding ${ trainRules.rules2.size } rules") 151 | val ple = ProbLogicEmbedder.embed(trainRules) 152 | 153 | println("Prediction") 154 | 155 | val predictor = new EntityAwarePredictor(ple, testEntities) 156 | val predictedFacts = test flatMap (row => predictor.predictAll(row, freebaseRelations)) 157 | 158 | FactorizationUtil.saveToFile(predictedFacts.sortBy(-_.fact.score), new File("/tmp/ent-facts.txt")) 159 | 160 | if (conf.getBoolean("epl.print-comparisons")) { 161 | println("Extracting learned rules") 162 | val learnedRules = ple.pairwiseRules(trainRules.rules2.keys) 163 | EmbeddedProbLogicEvaluation.compareRules(learnedRules, trainRules.rules2) 164 | } 165 | } 166 | 167 | } 168 | 169 | object EntityRuleLearner { 170 | 171 | import uclmr.EntityAwareEvaluation._ 172 | 173 | def toRule(rel1: String, rel2: String, 174 | pairCount: Int, singleCount1: Int, singleCount2: Int, normalizer: Double, 175 | priorCounts: Map[(Boolean, Boolean), Double] = Map.empty withDefaultValue 0.0) = { 176 | val prob11 = (pairCount + priorCounts(true, true)) / normalizer 177 | val prob10 = ((singleCount1 - pairCount) + priorCounts(true, false)) / normalizer 178 | val prob01 = ((singleCount2 - pairCount) + priorCounts(false, true)) / normalizer 179 | val prob00 = 1.0 - prob11 - prob10 - prob01 180 | val probs = Map( 181 | (true, true) -> prob11, (true, false) -> prob10, 182 | (false, true) -> prob01, (false, false) -> prob00 183 | ) 184 | Rule2(rel1, rel2, probs, 1.0, count = normalizer, 185 | cond1given2 = prob11 / (prob01 + prob11), 186 | cond2given1 = prob11 / (prob10 + prob11)) 187 | } 188 | 189 | def extractUnaryRules(entities: Map[Any, Entity], 190 | subSample: Double = 1.0, 
191 | priorCounts: Map[(Boolean, Boolean), Double] = Map.empty withDefaultValue 0.0) = {
192 |
193 | val pairCountsArg1 = mutable.HashMap[(String, String), Int]() withDefaultValue 0
194 | val pairCountsArg2 = mutable.HashMap[(String, String), Int]() withDefaultValue 0
195 | val singleCountsInArg1 = mutable.HashMap[String, Int]() withDefaultValue 0
196 | val singleCountsInArg2 = mutable.HashMap[String, Int]() withDefaultValue 0
197 |
198 | println("Entities: " + entities.size)
199 | for (ent <- entities.values) {
200 | for (p <- ent.asArg1) singleCountsInArg1(p) += 1
201 | for (p <- ent.asArg2) singleCountsInArg2(p) += 1
202 | if (ent.entity == "Nevada") {
203 | println(ent.asArg1.mkString(","))
204 | }
205 | if (ent.entity == "OPEC") {
206 | println(ent.asArg1.mkString(","))
207 | }
208 | if (ent.asArg1.contains("A1#path#nn|<-nn<-secretary->appos->|appos#1")) {
209 | println(ent.entity)
210 | println("Blah: " + singleCountsInArg1("A1#path#nn|<-nn<-secretary->appos->|appos#1"))
211 | }
212 | for (p1 <- ent.asArg1; p2 <- ent.asArg1; if p1 != p2) {
213 | pairCountsArg1(p1 -> p2) += 1
214 | }
215 | for (p1 <- ent.asArg2; p2 <- ent.asArg2; if p1 != p2) {
216 | pairCountsArg2(p1 -> p2) += 1
217 | }
218 | }
219 | println("Done counting")
220 | val arg1s = singleCountsInArg1.keys.toArray.sorted
221 | val arg2s = singleCountsInArg2.keys.toArray.sorted
222 | val normalizer = entities.size.toDouble + priorCounts.values.sum
223 | val result = new mutable.HashMap[(String, String), Rule2]()
224 | println("Done sorting etc.")
225 | for (i1 <- 0 until arg1s.size; i2 <- i1 + 1 until arg1s.size) {
226 | val a1 = arg1s(i1)
227 | val a2 = arg1s(i2)
228 | if (a2 == "A1#path#nn|<-nn<-secretary->appos->|appos#1") {
229 | println(toRule(a1, a2, pairCountsArg1(a1 -> a2), singleCountsInArg1(a1), singleCountsInArg1(a2), normalizer, priorCounts))
230 | }
231 | if (pairCountsArg1(a1, a2) >= 1 || random.nextDouble() < subSample)
232 | result(a1 -> a2) = toRule(a1, a2, pairCountsArg1(a1 -> a2), singleCountsInArg1(a1), singleCountsInArg1(a2), normalizer, priorCounts)
233 | }
234 | for (i1 <- 0 until arg2s.size; i2 <- i1 + 1 until arg2s.size) {
235 | val a1 = arg2s(i1)
236 | val a2 = arg2s(i2)
237 | if (a2 == "A1#path#nn|<-nn<-secretary->appos->|appos#2") {
238 | println(toRule(a1, a2, pairCountsArg2(a1 -> a2), singleCountsInArg2(a1), singleCountsInArg2(a2), normalizer, priorCounts))
239 | }
240 |
241 | if (pairCountsArg2(a1, a2) >= 1 || random.nextDouble() < subSample)
242 | result(a1 -> a2) = toRule(a1, a2, pairCountsArg2(a1 -> a2), singleCountsInArg2(a1), singleCountsInArg2(a2), normalizer, priorCounts)
243 | }
244 | println("Done!")
245 | Rules(result.toMap)
246 | }
247 |
248 |
249 | def extractRel2UnaryRules(entities: Map[Any, Entity],
250 | rows: Seq[Row],
251 | subSample: Double = 1.0,
252 | priorCounts: Map[(Boolean, Boolean), Double] = Map.empty withDefaultValue 0.0) = {
253 |
254 | val pairCounts = mutable.HashMap[(String, String), Int]() withDefaultValue 0
255 | val singleCounts = mutable.HashMap[String, Int]() withDefaultValue 0
256 | val singleCountsArgs = mutable.HashMap[String, Int]() withDefaultValue 0
257 |
258 |
259 | for (row <- rows) {
260 | val cells = row.relations
261 | val arg1 = entities(row.arg1)
262 | val arg2 = entities(row.arg2)
263 | for (cell <- cells) singleCounts(cell._1) += 1
264 | for (a1 <- arg1.asArg1) singleCountsArgs(a1) += 1
265 | for (a2 <- arg2.asArg2) singleCountsArgs(a2) += 1
266 |
267 | //we should avoid rules between unary and binary relations that are derived from the same binary relation
268 | for ((rel, _) <-
cells; a1 <- arg1.asArg1 if unaryToBinary(a1) != rel) { 269 | pairCounts(rel -> a1) += 1 270 | } 271 | for ((rel, _) <- cells; a2 <- arg2.asArg2 if unaryToBinary(a2) != rel) { 272 | pairCounts(rel -> a2) += 1 273 | } 274 | } 275 | 276 | val normalizer = rows.size.toDouble + priorCounts.values.sum 277 | 278 | println("Done counting") 279 | 280 | val result = new mutable.HashMap[(String, String), Rule2]() 281 | for (rel <- singleCounts.keys; 282 | arg <- singleCountsArgs.keys) { 283 | val (r1, r2, counts1, counts2) = 284 | if (rel.compareTo(arg) < 0) 285 | (rel, arg, singleCounts(rel), singleCountsArgs(arg)) 286 | else 287 | (arg, rel, singleCountsArgs(arg), singleCounts(rel)) 288 | if (pairCounts(r1, r2) >= 1 || random.nextDouble() < subSample) { 289 | // result(rel -> arg) = toRule(rel, arg, pairCounts(rel, arg), singleCounts(rel), singleCountsArgs(arg), normalizer, priorCounts) 290 | result((r1, r2)) = toRule(r1, r2, pairCounts(r1, r2), counts1, counts2, normalizer, priorCounts) 291 | } 292 | 293 | 294 | // if (arg == "A2#path#dobj|<-dobj<-replace->prep->in->pobj->|pobj#2") { 295 | // val rule = toRule(rel, arg, pairCounts(rel, arg), singleCounts(rel), singleCountsArgs(arg), normalizer, priorCounts) 296 | // if (rule.cond1given2 > 0.9) 297 | // println(rule) 298 | // } 299 | 300 | } 301 | println("Done!") 302 | Rules(result.toMap) 303 | } 304 | 305 | } 306 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/EntityAwarePredictor.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import uclmr.EntityAwareEvaluation.Entity 4 | import uclmr.FactorizationUtil.{PredictedFact, Row} 5 | import ml.wolfe.util.Util 6 | 7 | import scala.collection.mutable 8 | 9 | /** 10 | * @author Sebastian Riedel 11 | */ 12 | class EntityAwarePredictor(val embeddings: ProbLogicEmbeddings, val entities: Map[Any, Entity]) { 13 | 14 | val distanceCache = new mutable.HashMap[(String, String), Double]() 15 | 16 | def closest(candidates: Iterable[String], target: String) = { 17 | if (candidates.isEmpty) ("NA", Double.PositiveInfinity) 18 | else 19 | candidates.map(pred => { 20 | val dist = distanceCache.getOrElseUpdate(pred -> target, 21 | embeddings.embeddings(target).distance(embeddings.embeddings(pred))) 22 | pred -> dist 23 | }).minBy(_._2) 24 | } 25 | 26 | def farthest(candidates: Iterable[String], target: String) = { 27 | if (candidates.isEmpty) ("NA", Double.PositiveInfinity) 28 | else 29 | candidates.map(pred => { 30 | val dist = distanceCache.getOrElseUpdate(pred -> target, 31 | embeddings.embeddings(target).distance(embeddings.embeddings(pred))) 32 | pred -> dist 33 | }).maxBy(_._2) 34 | } 35 | 36 | def predictAll(row: Row, targetRelations:Seq[String], useFilter:Boolean = true) = { 37 | targetRelations.map(predict(row,_,useFilter)) 38 | } 39 | 40 | import EntityAwareEvaluation._ 41 | 42 | def predict(row: Row, target: String, useFilter:Boolean = true) = { 43 | val arg1 = entities(row.arg1) 44 | val arg2 = entities(row.arg2) 45 | 46 | val targetEmbedding = embeddings.embeddings(target) 47 | 48 | def filterObs(obs:Iterable[String]) = if (useFilter) obs.filter(targetEmbedding.observationFilter) else obs 49 | def asProb(pair:(String,Double)) = pair.copy(_2 = Util.sig(targetEmbedding.bias - pair._2)) 50 | 51 | //find best unary predicate for arg1 52 | val arg1Result = closest(filterObs(arg1.asArg1), target) 53 | //find best unary predicate for arg2 54 | val arg2Result = closest(filterObs(arg2.asArg2), 
target) 55 | //find best binary predicate as observation 56 | val relResult = closest(filterObs(row.relations.view.map(_._1)), target) 57 | 58 | val (predictor, score) = Iterator(arg1Result, arg2Result, relResult).maxBy(_._2) 59 | 60 | val prob = Util.sig(targetEmbedding.bias - score) 61 | EntityAwarePrediction( 62 | PredictedFact(row, target, prob), predictor, 63 | asProb(arg1Result), asProb(arg2Result), asProb(relResult) 64 | ) 65 | } 66 | 67 | } 68 | 69 | case class EntityAwarePrediction(fact: PredictedFact, predictor: String, 70 | arg1Result: (String, Double), arg2Result: (String, Double), relResult: (String, Double)) { 71 | override def toString = { 72 | s""" 73 | |$fact 74 | | Predictor: $predictor 75 | | Arg1: $arg1Result 76 | | Arg2: $arg2Result 77 | | Rel: $relResult 78 | """.stripMargin 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/FactorizationUtil.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import java.io._ 4 | 5 | import scala.collection.mutable 6 | import scala.collection.mutable.ArrayBuffer 7 | import scala.io.Source 8 | import scala.util.Random 9 | 10 | /** 11 | * @author Sebastian Riedel 12 | */ 13 | object FactorizationUtil { 14 | 15 | case class Row(arg1: Any, arg2: Any, relations: Seq[(String, Double)], hidden: Set[String] = Set.empty) { 16 | def rowName = s"($arg1,$arg2)" 17 | def observedTrue = relations.filter(_._2 > 0.5).map(_._1) 18 | } 19 | 20 | def sampleRows(rows: Int, rels: Int, density: Double = 0.1)(implicit random: Random) = { 21 | for (pair <- 0 until rows) yield { 22 | val cells = for (rel <- 0 until rels; if random.nextDouble() <= density) yield ("r" + rel, 1.0) 23 | Row(pair.toString, pair.toString, cells) 24 | } 25 | } 26 | 27 | def loadLiminFile(file: File, 28 | relationFilter: String => Boolean = _ => true, 29 | freebaseLabels: Seq[String] = Seq(), minObsCount: Int = 2, skipUnlabeled: Boolean = false): Iterator[Row] = { 30 | val source = Source.fromFile(file, "ISO-8859-1") 31 | for (line <- source.getLines(); 32 | split = line.split("\t"); 33 | arg1 = split(1); 34 | arg2 = split(2); 35 | filteredRelations = split.drop(3).filter(relationFilter) 36 | if filteredRelations.size >= minObsCount && (!skipUnlabeled || split(0) != "UNLABELED") 37 | ) yield { 38 | 39 | val asSet = filteredRelations.toSet 40 | //POSITIVE: entity pair in freebase, and one relation was seen 41 | //NEGATIVE: entity pair in freebase, but no relation was observed, this means that we can 42 | // more confidently label them negative 43 | //UNLABELLED: entity pair not in freebase, in some sense 44 | val cells = split(0) match { 45 | case "POSITIVE" => filteredRelations.map((_, 1.0)) ++ freebaseLabels.filterNot(asSet).map((_, 0.0)) 46 | case "NEGATIVE" => filteredRelations.map((_, 1.0)) ++ freebaseLabels.map((_, 0.0)) 47 | case "UNLABELED" => filteredRelations.map((_, 1.0)) 48 | } 49 | Row(arg1, arg2, cells) 50 | } 51 | } 52 | 53 | def filterRows(rows: Seq[Row], minRowCount: Int = 10, minColCount: Int = 2, relFilter: String => Boolean = _ => true): Seq[Row] = { 54 | //rule: every row should have at least minColCount active cells, and each column needs minRowCount. 
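// Illustrative walk-through (assumed example, not from the original source): with
// minRowCount = 2 and minColCount = 1, a relation that fires in only one row is pruned:
//
//   val rows = Seq(
//     Row("e1", "e2", Seq("relA" -> 1.0, "relB" -> 1.0)),
//     Row("e3", "e4", Seq("relA" -> 1.0)))
//   filterRows(rows, minRowCount = 2, minColCount = 1)
//   // "relA" sums to 2.0 across rows and is kept; "relB" sums to 1.0 and is dropped,
//   // after which both rows still have at least one cell and survive.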
55 | val counts = new mutable.HashMap[String, Double]() withDefaultValue 0.0
56 | for (row <- rows; (rel, value) <- row.relations if relFilter(rel)) counts(rel) += value
57 |
58 | for (row <- rows;
59 | cells = row.relations.filter(c => counts(c._1) >= minRowCount)
60 | if cells.size >= minColCount) yield {
61 | row.copy(relations = cells)
62 | }
63 | }
64 |
65 | def filterRowsPairwise(rows: Seq[Row], minPairCount: Int = 3): Seq[Row] = {
66 | //alternative: each relation should have at least one other relation with at least minPairCount co-occurrences
67 | val counts = new mutable.HashMap[(String, String), Double]() withDefaultValue 0.0
68 | for (row <- rows;
69 | (rel1, value1) <- row.relations;
70 | (rel2, value2) <- row.relations if rel1 != rel2) counts(rel1 -> rel2) += value1 * value2
71 | val maxCounts = counts.toSeq.groupBy(_._1._1).mapValues(_.view.map(_._2).max)
72 | for (row <- rows;
73 | cells = row.relations.filter(c => maxCounts(c._1) >= minPairCount)) yield row.copy(relations = cells)
74 | }
75 |
76 |
77 | case class PredictedFact(row: Row, relation: String, score: Double) {
78 | override def toString = s"$score\t$relation\t${ row.rowName }\t${ row.observedTrue.mkString(" ") }"
79 | def toUSchemaString = s"$score\t${ row.arg1 }\t${ row.arg2 }\tREL${ "$NA" }\t$relation"
80 | }
81 |
82 | def toRankedFacts(predictions: Seq[(Row, Row)]): Seq[PredictedFact] = {
83 | val facts = for ((obs, guess) <- predictions; (rel, value) <- guess.relations) yield PredictedFact(obs, rel, value)
84 | val sorted = facts.sortBy(-_.score)
85 | sorted
86 | }
87 |
88 | def saveToFile(content: String, file: File): Unit = {
89 | val out = new PrintStream(file)
90 | out.println(content)
91 | out.close()
92 | }
93 |
94 | def saveToFile[T](content: Iterable[T], file: File): Unit = {
95 | val out = new PrintWriter(new BufferedWriter(new FileWriter(file)))
96 | for (line <- content)
97 | out.println(line.toString)
98 | out.close()
99 | }
100 |
101 |
102 | def saveForUSchemaEval(facts: Seq[PredictedFact], file: File): Unit = {
103 | val out = new PrintStream(file)
104 | for (fact <- facts) {
105 | out.println(fact.toUSchemaString)
106 | }
107 | out.close()
108 | }
109 |
110 | def renderPredictions(prediction: Seq[Row], truth: Seq[Row] = Seq.empty) = {
111 | import ml.wolfe.util.ANSIFormatter._
112 | val relations =
113 | (prediction.flatMap(_.relations.map(_._1)) ++ truth.flatMap(_.relations.map(_._1))).distinct.sorted
114 | val colWidth = math.max(relations.map(_.toString.length).max + 1, 5)
115 | val firstColWidth = prediction.map(_.rowName.length).max + 1
116 |
117 | val colFormat = "%" + colWidth + "s"
118 | val firstColFormat = "%" + firstColWidth + "s"
119 | val cellFormat = "%" + (colWidth - 1) + "s "
120 | val pFormat = "%4.2f"
121 |
122 | val sb = new mutable.StringBuilder()
123 | sb ++= " " * firstColWidth
124 | relations.foreach(col => sb ++= colFormat.format(col))
125 | sb ++= "\n"
126 |
127 | val truthMap = truth.map(r => (r.arg1, r.arg2) -> r).toMap
128 |
129 | for (row <- prediction) {
130 | val trueRow = truthMap.get((row.arg1, row.arg2))
131 | sb ++= firstColFormat.format(row.rowName) + " "
132 | val col2value = row.relations.toMap withDefaultValue 0.0
133 | val col2trueValue = trueRow.map(_.relations.toMap).getOrElse(Map.empty)
134 | for (col <- relations) {
135 | val score = col2value(col)
136 | val pString = cellFormat.format(pFormat.format(score))
137 | val actualString = col2trueValue.get(col) match {
138 | case Some(value) => if (value > 0.5) pString.onGreen() else pString
139 | case None => pString
140 | }
141 | sb
++= actualString 142 | } 143 | sb ++= "\n" 144 | 145 | } 146 | sb.toString() 147 | 148 | } 149 | 150 | def filterRankedFile(dest: String, filterTuple: String, source: String) { 151 | val allowed = new mutable.HashSet[Seq[Any]]() 152 | 153 | val out = new PrintStream(dest) 154 | 155 | for (line <- Source.fromFile(filterTuple).getLines(); if line.trim != "") { 156 | val split = line.split("\t") 157 | val tuple = if (split.size == 2) Seq(split(0), split(1)) else Seq(split(1), split(2)) 158 | allowed += tuple 159 | } 160 | println(allowed.size) 161 | 162 | def norm(label: String) = if (label.contains("/") && !label.startsWith("REL$")) "REL$" + label else label 163 | 164 | for (line <- Source.fromFile(source).getLines()) { 165 | val split = line.split("[\t]") 166 | if (split(1).contains("|")) { 167 | val tuple = split(1).split("\\|").toSeq 168 | if (allowed(tuple)) out.println(split(0) + "\t" + tuple.mkString("\t") + "\t" + split.drop(2).map(norm).mkString("\t")) 169 | } else { 170 | val tuple = Seq(split(1), split(2)) 171 | if (allowed(tuple)) out.println(split.take(3).mkString("\t") + "\t" + split.drop(3).map(norm).mkString("\t")) 172 | } 173 | } 174 | 175 | out.close() 176 | } 177 | 178 | def main(args: Array[String]) { 179 | filterRankedFile( 180 | "/tmp/ple-subsample.txt", 181 | "/Users/sriedel/projects/spdb/naacl2013/nyt-freebase.test.subsample-10000.tuples.txt", 182 | "/tmp/ple.txt" 183 | ) 184 | } 185 | 186 | 187 | } 188 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/LogicalInference.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import uclmr.CellType.CellType 4 | 5 | /** 6 | * @author rockt 7 | * Very basic logical inference. Assumes that the formulaList or the formulae in db are consistent. 8 | * todo: can be sped up by not touching premises twice 9 | */ 10 | object LogicalInference { 11 | def apply(db: TensorDB, formulaList: List[Formula] = Nil, newCellType: CellType = CellType.Train, usePredictions: Boolean = false, threshold: Double = 0.5): Unit = { 12 | var converged = false 13 | 14 | val formulae = if (formulaList.isEmpty) db.formulae.toList else formulaList 15 | while (!converged) { 16 | converged = true 17 | 18 | for (formula <- formulae) formula match { 19 | case Impl(p1, p2, _) => 20 | val cs = if (usePredictions) db.getPredictedBy1(p1, threshold) else db.getBy1(p1) 21 | cs.foreach(c => { 22 | val (c1, c2) = c 23 | val cellOpt = db.get(p2, c1, c2) 24 | 25 | if (!cellOpt.isDefined || cellOpt.get.cellType != newCellType) { 26 | converged = false 27 | db += Cell(p2, c1, c2, target = 1.0, cellType = newCellType) 28 | } 29 | }) 30 | case ImplNeg(p1, p2, _) => 31 | val cs = if (usePredictions) db.getPredictedBy1(p1, threshold) else db.getBy1(p1) 32 | cs.foreach(c => { 33 | val (c1, c2) = c 34 | val cellOpt = db.get(p2, c1, c2) 35 | 36 | if (!cellOpt.isDefined || cellOpt.get.cellType != newCellType) { 37 | converged = false 38 | db += Cell(p2, c1, c2, target = 0.0, cellType = newCellType) 39 | } 40 | }) 41 | case _ => ??? 
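// Note on the fixed point above: `converged` is reset whenever a consequent cell is
// still missing or not yet marked with `newCellType`, so the while-loop keeps sweeping
// over all formulae until a full pass changes nothing. Chained implications therefore
// fire transitively, e.g. Impl("r4", "r6") and Impl("r6", "r2") turn a true r4 cell
// into an r6 cell on one pass and an r2 cell on the next.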
42 | }
43 | }
44 |
45 | }
46 | }
47 |
48 | object LogicalInferenceSpec extends App {
49 | val k = 5
50 | val db = new TensorKB(k)
51 | db.sampleTensor(10, 10, 0, 0.1)
52 | db.toFactorGraph
53 |
54 | db += Cell("r6", "e6", DefaultIx, 0.0, CellType.Test)
55 |
56 | db += Impl("r4", "r6")
57 | db += Impl("r6", "r2")
58 |
59 | println(db.toVerboseString())
60 |
61 | //fixme: second baseline actually needs to go over *predicted* true premises
62 | LogicalInference(db, newCellType = CellType.Inferred, usePredictions = true, threshold = 0.49)
63 | //LogicalInference(db, newCellType = CellType.Inferred)
64 |
65 |
66 | println(db.toVerboseString())
67 |
68 | println("Inferred cells:\n" + db.inferredCells.mkString("\n"))
69 | }
-------------------------------------------------------------------------------- /src/main/scala/uclmr/MatrixFactorization.scala: --------------------------------------------------------------------------------
1 | package uclmr
2 |
3 | import java.io.{File, FileWriter}
4 |
5 | import cc.factorie.la.SparseBinaryTensor1
6 | import cc.factorie.optimize._
7 | import uclmr.io._
8 | import ml.wolfe.fg.{L2Regularization, _}
9 | import ml.wolfe.util.{Conf, ProgressLogging, Timer}
10 | import ml.wolfe.{DenseVector, GradientBasedOptimizer, Wolfe}
11 |
12 | import scala.io.Source
13 | import scala.util.Random
14 |
15 | /**
16 | * @author Sebastian Riedel
17 | * @author rockt
18 | */
19 |
20 | object MatrixFactorization extends App {
21 | val mf = new MatrixFactorization(args.lift(0).getOrElse("conf/mf.conf"))
22 | val wMAP = mf.run()
23 | println(wMAP)
24 | }
25 |
26 | class MatrixFactorization(confPath: String = "conf/mf.conf") {
27 | val debug = false //whether to use a small synthetic matrix or actual data
28 | val loadFormulae = debug && true //whether formulae should be sampled for debugging
29 | //val print = false //whether to print the matrix (only do this for small ones!)
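// Background sketch (illustrative only, not part of the original file): each observed
// cell (relation column c, entity-pair row r) is scored by a logistic model on the dot
// product of two k-dimensional embeddings, roughly:
//
//   def cellProb(col: Array[Double], row: Array[Double]): Double = {
//     val score = col.indices.map(i => col(i) * row(i)).sum // dot product of column and row vector
//     1.0 / (1.0 + math.exp(-score))                        // sigmoid maps the score into (0, 1)
//   }
//
// The CellLogisticLoss factors built below push this probability towards 1.0 for
// observed cells and, via sampled negative factors, towards 0.0 for unobserved ones.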
30 | 31 | Conf.add(confPath) 32 | Conf.outDir //sets up output directory 33 | implicit val conf = Conf 34 | println("Using " + confPath) 35 | 36 | val dataType = conf.getString("dataType") 37 | assert(dataType == "naacl" || dataType == "figer" || dataType == "tsv", s"dataType $dataType should be 'naacl' or 'figer' or 'tsv'.") 38 | val useFeatures = (dataType == "figer" && conf.getBoolean("figer.use-features")) || (dataType == "naacl" && conf.getBoolean("mf.use-features")) 39 | 40 | val outputPath = conf.getString("outDir") 41 | val fileName = conf.getString("mf.outFile") 42 | 43 | val mode = conf.getString("mf.mode") 44 | 45 | //model parameters 46 | val k = conf.getInt("mf.k") 47 | val lambda = conf.getDouble("mf.lambda") 48 | val alpha = conf.getDouble("mf.alpha") 49 | val maxIter = conf.getInt("mf.maxIter") 50 | 51 | val subsample = conf.getDouble("mf.subsample") 52 | val negPerPos = conf.getInt("mf.negPerPos") 53 | val unobservedPerF = conf.getInt("mf.unobservedPerF") 54 | 55 | val cellWeight = conf.getDouble("mf.cellWeight") 56 | val formulaeWeight = conf.getDouble("mf.formulaeWeight") 57 | 58 | val optimizer = conf.getString("mf.optimizer") 59 | val batchTraining = conf.getBoolean("mf.batchTraining") 60 | 61 | val bpr = conf.getBoolean("mf.bpr") 62 | 63 | val postInferenceThreshold = 0.5 64 | 65 | 66 | val db = if (debug) { 67 | val tmp = new TensorKB(4) 68 | tmp.sampleTensor(10, 10, 0, 0.1) //samples a matrix 69 | //tmp += Cell("r3", "r3-#premise") 70 | //tmp += Cell("r4", "r4-#consequent") 71 | if (loadFormulae) { 72 | tmp += Impl("r3", "r4") 73 | tmp += Impl("r4", "r6") 74 | tmp += Impl("r6", "r2") 75 | //tmp += ImplNeg("r8", "r6") 76 | } 77 | tmp 78 | } else dataType match { 79 | case "naacl" => LoadNAACL(k, subsample) 80 | case "figer" => LoadFIGER(k, subsample) 81 | case "tsv" => LoadTSV(k, subsample) 82 | } 83 | 84 | val rand = new Random(0l) 85 | 86 | val fg = db.toFactorGraph 87 | 88 | val trainDebugString = if (debug) db.toVerboseString(showTrain = true) else "" 89 | if (mode == "pre-inference" || mode == "pre-post-inference") LogicalInference(db, newCellType = CellType.Train) 90 | 91 | val data = rand.shuffle(db.trainCells) 92 | val colNodes = db.ix1ToNodeMap //cols 93 | val rowNodes = db.ix2ToNodeMap //rows 94 | 95 | //initialize embeddings 96 | //def nextInit() = (rand.nextDouble() - 0.5) * 0.1 97 | def nextInit() = rand.nextGaussian() * 0.1 98 | (colNodes.values.view ++ rowNodes.values.view).foreach(n => 99 | n.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray)) 100 | if (useFeatures) db match { 101 | case f: Features => 102 | f.fwnodes1.foreach(n => n.variable.asVector.b = new DenseVector((0 until f.numFeatures1).map(i => nextInit()).toArray)) 103 | f.fwnodes2.foreach(n => n.variable.asVector.b = new DenseVector((0 until f.numFeatures2).map(i => nextInit()).toArray)) 104 | } 105 | 106 | 107 | 108 | //fact factors 109 | for (d <- data) { 110 | val (colIx, rowIx, _) = d.key 111 | val r = rowNodes(rowIx) //entity 112 | val c = colNodes(colIx) //relation 113 | 114 | if (bpr) fg.buildStochasticFactor(Seq(r, db.sampleNodeFrom2(colIx), c))(_ map (_ => new VectorMsgs)) { 115 | e => new BPRPotential(e(0), e(1), e(2), 1.0, lambda) with L2Regularization 116 | } 117 | else { 118 | if (useFeatures) db match { 119 | case dbf: Features => { 120 | // assumes only features on rows (weights for each column) 121 | val fwnode = dbf.fwnode2(colIx).get 122 | val fnode = dbf.fnode2(rowIx).get 123 | fg.buildFactor(Seq(r, c, fwnode, fnode))(_ map (_ => new 
VectorMsgs)) { 124 | e => new CellLogisticLossWithRowFeatures(e(0), e(1), e(2), e(3), 1.0, lambda, cellWeight) with L2Regularization 125 | } 126 | 127 | (0 until negPerPos).foreach { i => 128 | fg.buildStochasticFactor({ 129 | val nr = db.sampleNodeFrom2(colIx) 130 | val nrfnode = dbf.fnode2(nr.variable.label).get 131 | Seq(nr, c, fwnode, nrfnode) 132 | })(_ map (_ => new VectorMsgs)) { 133 | e => new CellLogisticLossWithRowFeatures(e(0), e(1), e(2), e(3), 0.0, lambda, cellWeight / negPerPos) with L2Regularization 134 | } 135 | } 136 | } 137 | } else { 138 | fg.buildFactor(Seq(r, c))(_ map (_ => new VectorMsgs)) { 139 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda, cellWeight) with L2Regularization 140 | } 141 | 142 | (0 until negPerPos).foreach { i => 143 | fg.buildStochasticFactor(Seq(c, db.sampleNodeFrom2(colIx)))(_ map (_ => new VectorMsgs)) { 144 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, cellWeight / negPerPos) with L2Regularization 145 | } 146 | } 147 | } 148 | } 149 | } 150 | 151 | if (mode == "low-rank-logic") { 152 | //formulae factors 153 | for (d <- data) { 154 | //colIx: relation 155 | //rowIx: entity 156 | val (colIx, rowIx, _) = d.key 157 | 158 | val a = rowNodes(rowIx) 159 | val v = colNodes(colIx) 160 | 161 | for (formula <- db.formulaeByPredicate(colIx)) { 162 | val cNode = v 163 | if (formula.isFormula2) { 164 | val Seq(p1, p2) = formula.predicates 165 | 166 | //can only inject formulae whose predicates exist 167 | if (db.node1(p1).isDefined && db.node1(p2).isDefined) { 168 | val p1Node = db.node1(p1).get 169 | val p2Node = db.node1(p2).get 170 | 171 | formula match { 172 | case Impl(_, _, target) => 173 | fg.buildFactor(Seq(cNode, p1Node, p2Node))(_ map (_ => new VectorMsgs)) { 174 | e => new ImplPotential(e(0), e(1), e(2), target, lambda, formulaeWeight) with L2Regularization 175 | } 176 | (0 until unobservedPerF).foreach { i => 177 | fg.buildStochasticFactor(Seq(db.sampleNodeFrom2(colIx, sampleTestRows = Conf.getBoolean("mf.test-row-terms")), p1Node, p2Node))(_ map (_ => new VectorMsgs)) { 178 | e => new ImplPotential(e(0), e(1), e(2), target, lambda, formulaeWeight) with L2Regularization 179 | } 180 | } 181 | 182 | case ImplNeg(_, _, target) => 183 | fg.buildFactor(Seq(cNode, p1Node, p2Node))(_ map (_ => new VectorMsgs)) { 184 | e => new ImplNegPotential(e(0), e(1), e(2), target, lambda, formulaeWeight) with L2Regularization 185 | } 186 | (0 until unobservedPerF).foreach { i => 187 | fg.buildStochasticFactor(Seq(db.sampleNodeFrom2(colIx, sampleTestRows = Conf.getBoolean("mf.test-row-terms")), p1Node, p2Node))(_ map (_ => new VectorMsgs)) { 188 | e => new ImplNegPotential(e(0), e(1), e(2), target, lambda, formulaeWeight) with L2Regularization 189 | } 190 | } 191 | } 192 | } 193 | } else { 194 | ??? 
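// Only pairwise formulae (Impl / ImplNeg over two predicates) are turned into
// potentials above; any other formula type falls through to this unimplemented branch.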
195 | } 196 | } 197 | } 198 | } 199 | if (mode == "gen-fake-data") { 200 | //formulae factors 201 | var fidx = 0 202 | val numFakeData = conf.getInt("mf.num-fake-data") 203 | val weight = numFakeData * formulaeWeight 204 | for (formula <- db.formulae) { 205 | if (formula.isFormula2) { 206 | val Seq(p1, p2) = formula.predicates 207 | //can only inject formulae whose predicates exist 208 | if (db.node1(p1).isDefined && db.node1(p2).isDefined) { 209 | val p1Node = db.node1(p1).get 210 | val p2Node = db.node1(p2).get 211 | 212 | formula match { 213 | case Impl(_, _, target) => { 214 | println("Adding fake cells for formula: " + p1.toString + " -> " + p2.toString) 215 | val e11 = fg.addVectorNode(k, "e11+" + fidx) 216 | e11.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray) 217 | fg.buildFactor(Seq(p1Node, e11))(_ map (_ => new VectorMsgs)) { 218 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda, weight) with L2Regularization 219 | } 220 | fg.buildFactor(Seq(p2Node, e11))(_ map (_ => new VectorMsgs)) { 221 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda, weight) with L2Regularization 222 | } 223 | val e01 = fg.addVectorNode(k, "e01+" + fidx) 224 | e01.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray) 225 | fg.buildFactor(Seq(p1Node, e01))(_ map (_ => new VectorMsgs)) { 226 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 227 | } 228 | fg.buildFactor(Seq(p2Node, e01))(_ map (_ => new VectorMsgs)) { 229 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda, weight) with L2Regularization 230 | } 231 | val e00 = fg.addVectorNode(k, "e00+" + fidx) 232 | e00.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray) 233 | fg.buildFactor(Seq(p1Node, e00))(_ map (_ => new VectorMsgs)) { 234 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 235 | } 236 | fg.buildFactor(Seq(p2Node, e00))(_ map (_ => new VectorMsgs)) { 237 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 238 | } 239 | } 240 | case ImplNeg(_, _, target) => { 241 | println("Adding fake cells for formula: " + p1.toString + " -> !" 
+ p2.toString) 242 | val e10 = fg.addVectorNode(k, "e10+" + fidx) 243 | e10.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray) 244 | fg.buildFactor(Seq(p1Node, e10))(_ map (_ => new VectorMsgs)) { 245 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda, weight) with L2Regularization 246 | } 247 | fg.buildFactor(Seq(p2Node, e10))(_ map (_ => new VectorMsgs)) { 248 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 249 | } 250 | val e01 = fg.addVectorNode(k, "e01+" + fidx) 251 | e01.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray) 252 | fg.buildFactor(Seq(p1Node, e01))(_ map (_ => new VectorMsgs)) { 253 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 254 | } 255 | fg.buildFactor(Seq(p2Node, e01))(_ map (_ => new VectorMsgs)) { 256 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda, weight) with L2Regularization 257 | } 258 | val e00 = fg.addVectorNode(k, "e00+" + fidx) 259 | e00.variable.asVector.b = new DenseVector((0 until k).map(i => nextInit()).toArray) 260 | fg.buildFactor(Seq(p1Node, e00))(_ map (_ => new VectorMsgs)) { 261 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 262 | } 263 | fg.buildFactor(Seq(p2Node, e00))(_ map (_ => new VectorMsgs)) { 264 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, weight) with L2Regularization 265 | } 266 | } 267 | } 268 | } 269 | fidx += 1 270 | } else { 271 | ??? 272 | } 273 | } 274 | } 275 | 276 | fg.build() 277 | 278 | 279 | println("DB:" + db.toInfoString) 280 | println("FG:" + fg.toInspectionString) 281 | 282 | val gradientOptimizer = optimizer match { 283 | case "SGD" => new ConstantLearningRate(baseRate = alpha) 284 | case "AdaGrad" => new AdaGrad(rate = alpha) 285 | case "AdaMira" => new AdaMira(rate = alpha) //rockt: doesn't seem to make a difference to AdaGrad 286 | case "LBFGS" => new LBFGS(Double.MaxValue, Int.MaxValue) //rockt: not working atm 287 | case "AvgPerceptron" => new AveragedPerceptron() 288 | } 289 | 290 | 291 | def run(): Double = { 292 | println("Optimizing...") 293 | Timer.time("optimization") { 294 | if (mode != "inference-only") 295 | GradientBasedOptimizer(fg, 296 | if (batchTraining) new BatchTrainer(_, gradientOptimizer, maxIter) with ProgressLogging 297 | else new OnlineTrainer(_, gradientOptimizer, maxIter, fg.factors.size - 1) with ProgressLogging 298 | ) 299 | 300 | if (mode == "post-inference" || mode == "pre-post-inference") 301 | LogicalInference(db, newCellType = CellType.Inferred, usePredictions = true, threshold = postInferenceThreshold) 302 | if (mode == "inference-only") 303 | LogicalInference(db, newCellType = CellType.Inferred) 304 | } 305 | println("Done after " + Timer.reportedVerbose("optimization")) 306 | 307 | var wMAP = 0.0 308 | 309 | if (debug) { 310 | println("train:") 311 | println(trainDebugString) 312 | println() 313 | println("predicted:") 314 | println(db.toVerboseString()) 315 | if (Conf.getBoolean("serialize")) db.serialize(Conf.outDir.getAbsolutePath + "/serialized/") 316 | } else { 317 | Conf.createSymbolicLinkToLatest() //rewire symbolic link to latest (in case it got overwritten) 318 | val pathToPredict = Conf.outDir.getAbsolutePath + "/" + fileName 319 | dataType match { 320 | case "naacl" => 321 | WriteNAACL(db, pathToPredict) 322 | val evalConf = "./conf/" + Conf.getString("evalConf") 323 | wMAP = new EvaluateNAACL(evalConf, pathToPredict).eval() 324 | case "figer" => 325 | WriteFIGER(db, pathToPredict) 326 | 
EvaluateFIGER.main(Array(pathToPredict, Conf.outDir.getAbsolutePath)) 327 | case "tsv" => //todo: write out predictions (for all cells?) 328 | } 329 | 330 | //db.writeVectors(Conf.outDir.getAbsolutePath + "/vectors.tsv") 331 | 332 | if (Conf.getBoolean("serialize")) db.serialize(Conf.outDir.getAbsolutePath + "/serialized/") 333 | 334 | 335 | 336 | import scala.sys.process._ 337 | Process("pdflatex -interaction nonstopmode -shell-escape table.tex", new File(Conf.outDir.getAbsolutePath)).!! 338 | 339 | if (Conf.hasPath(dataType + ".formulaeFile") && Conf.getString(dataType + ".formulaeFile") != "None") { 340 | val formulaeFile = new File(Conf.getString(dataType + ".formulaeFile")) 341 | val lines = Source.fromFile(formulaeFile).getLines() 342 | val writer = new FileWriter(Conf.outDir.getAbsolutePath + "/" + formulaeFile.getAbsolutePath.split("/").last) 343 | writer.write(lines.mkString("\n")) 344 | writer.close() 345 | } 346 | } 347 | 348 | wMAP 349 | } 350 | } 351 | 352 | object WolfeStyleMF extends App { 353 | 354 | import ml.wolfe.Wolfe._ 355 | import ml.wolfe.macros.OptimizedOperators._ 356 | case class Data(rel:String, arg1:String, arg2:String, target:Double) 357 | 358 | case class Model(relationVectors:Map[String,Seq[Double]], entityPairVectors:Map[(String,String),Seq[Double]]) 359 | 360 | def dot(a1:Seq[Double],a2:Seq[Double]) = ??? 361 | 362 | val rels = Seq("profAt") 363 | val ents = Seq("Luke" -> "MIT") 364 | 365 | 366 | def searchSpace(k:Int) = all(Model)(maps(rels,fvectors(k)) x maps(ents,fvectors(k))) 367 | 368 | def fvectors(k:Int) = Wolfe.seqsOfLength(k,Wolfe.doubles) 369 | 370 | 371 | 372 | //@Potential(???) //cell logistic potential 373 | def logisticLoss(target:Double, arg1:Seq[Double], arg2:Seq[Double]) = 374 | //todo: sigmoid 375 | sum(0 until arg1.length) { i => arg1(i) * arg2(i) } 376 | 377 | //@Stochastic(String => (String, String)) //samples a non-observed pair efficiently from data; not for now 378 | //creates as many stochastic factors as the integer before the sum 379 | @Stochastic 380 | def negativeDataLoss(data: Seq[Data])(model: Model) = { 381 | val r = data.head.rel 382 | val numObserved = data.size //function of r 383 | val numUnobserved = ents.size - numObserved 384 | 385 | //there needs to be a default implementation that takes the filtered domain (ents) and samples from it 386 | numObserved * sum(ents filter { pair => !data.exists(d => pair == (d.arg1, d.arg2)) }){ pair => 387 | logisticLoss(0.0, model.entityPairVectors(pair), model.relationVectors(r)) * (numUnobserved / numObserved.toDouble) 388 | } 389 | } 390 | 391 | def objective(data:Seq[Data])(model:Model) = { 392 | sum(data) { d => logisticLoss(d.target,model.entityPairVectors(d.arg1 -> d.arg2), model.relationVectors(d.rel)) } + 393 | sum(rels) { r => negativeDataLoss(data.filter(_.rel == r))(model) } 394 | } 395 | 396 | println("It compiles, yay! 
:)")
397 | }
-------------------------------------------------------------------------------- /src/main/scala/uclmr/PimpMyFactorie.scala: --------------------------------------------------------------------------------
1 | package uclmr
2 |
3 | import cc.factorie.la._
4 | import scala.Array
5 | import cc.factorie.util.SparseDoubleSeq
6 | import scala.language.implicitConversions
7 |
8 | /**
9 | * @author rockt
10 | */
11 | class PimpMyFactorie {
12 |
13 | }
14 |
15 |
16 | object PimpMyFactorie {
17 | //FIXME: for some reason this method is not applied implicitly
18 | implicit def applyElementwise(fun: Double => Double): (Tensor => Tensor) = {
19 | (tensor: Tensor) =>
20 | for ((ix, value) <- tensor.activeElements) tensor.update(ix, fun(value))
21 | tensor
22 | }
23 |
24 | implicit class PimpedTensor(self: Tensor) {
25 | def toPrettyString: String = self match {
26 | //case sparse: SparseDoubleSeq => sparse.activeElements.map(t => t._1 + "\t" + t._2).mkString("\n")
27 | case tensor1: Tensor1 => tensor1.asArray.mkString("\n")
28 | case tensor2: Tensor2 => (0 until tensor2.dim1).map(row => (0 until tensor2.dim2).map(col => tensor2(row, col)).mkString(" ")).mkString("\n")
29 | case tensor3: Tensor3 =>
30 | (0 until tensor3.dim2).map(row => (0 until tensor3.dim1).map(layer =>
31 | (0 until tensor3.dim3).map(col => tensor3(layer, row, col)).mkString(" ")
32 | ).mkString(" | ")).mkString("\n")
33 | }
34 | def toDimensionsString: String = self match {
35 | case tensor1: Tensor1 => tensor1.dim1.toString
36 | case tensor2: Tensor2 => s"${tensor2.dim1}×${tensor2.dim2}"
37 | case tensor3: Tensor3 => s"${tensor3.dim1}×${tensor3.dim2}×${tensor3.dim3}"
38 | }
39 | def vectorization: Tensor1 = new DenseTensor1(self.asArray)
40 |
41 | /**
42 | * Two tensors are equal if they have the same dimensions and values
43 | */
44 | def ===(obj: scala.Any): Boolean = (obj, self) match {
45 | case (other: Tensor1, self: Tensor1) => {
46 | if (other.dim1 != self.dim1) false
47 | else {
48 | for (i <- 0 until self.dim1)
49 | if (self(i) != other(i)) return false
50 | true
51 | }
52 | }
53 | case (other: Tensor2, self: Tensor2) => {
54 | if (other.dim1 != self.dim1 ||
55 | other.dim2 != self.dim2) false
56 | else {
57 | for {
58 | i <- 0 until self.dim1
59 | j <- 0 until self.dim2
60 | }
61 | if (self(i,j) != other(i,j)) return false
62 | true
63 | }
64 | }
65 | case (other: Tensor3, self: Tensor3) => {
66 | if (other.dim1 != self.dim1 ||
67 | other.dim2 != self.dim2 ||
68 | other.dim3 != self.dim3) false
69 | else {
70 | for {
71 | i <- 0 until self.dim1
72 | j <- 0 until self.dim2
73 | k <- 0 until self.dim3
74 | }
75 | if (self(i,j,k) != other(i,j,k)) return false
76 | true
77 | }
78 | }
79 | case _ => self.equals(obj)
80 | }
81 | }
82 |
83 | implicit class PimpedTensor1(self: Tensor1) {
84 | def t: Tensor2 = new DenseTensor2(Array(self.asArray))
85 | def slice(from: Int, to: Int): Tensor1 = new DenseTensor1(self.asArray.slice(from, to))
86 | def *(tensor2: Tensor2): Tensor1 = tensor2.leftMultiply(self)
87 |
88 | def <>(tensor1: Tensor1): Tensor2 = self.outer(tensor1).asInstanceOf[Tensor2]
89 | }
90 |
91 | /**
92 | * Pimped tensor2 with dim1 = rows, dim2 = columns
93 | */
94 | implicit class PimpedTensor2(self: Tensor2) {
95 | /**
96 | * Returns the transpose of the matrix
97 | */
98 | def t: Tensor2 = {
99 | new DenseTensor2(self) {
100 | override protected def _initialArray: Array[Double] = self.asArray
101 | override val dim1 = self.dim2
102 | override val dim2 = self.dim1
103 | override
def apply(i: Int, j: Int): Double = self.apply(j, i)
104 | }
105 | }
106 |
107 | //TODO: make this more efficient
108 | def multiply(other: Tensor2): Tensor2 = {
109 | require(self.dim2 == other.dim1, s"${self.dim1}x${self.dim2} * ${other.dim1}x${other.dim2}")
110 | val tmp = new DenseTensor2(self.dim1, other.dim2)
111 | for {
112 | i <- 0 until self.dim1
113 | j <- 0 until other.dim2
114 | } tmp.update(i, j, (for (k <- 0 until self.dim2) yield self(i, k) * other(k, j)).sum)
115 | tmp
116 | }
117 |
118 | //rockt: inefficient?
119 | def reshape(rows: Int, columns: Int): Tensor2 = {
120 | require(rows * columns == self.dim1 * self.dim2)
121 | new DenseTensor2(self.asSeq.grouped(columns).toArray)
122 | }
123 |
124 | /**
125 | * Updates the ith column with tensor1
126 | */
127 | def update(i: Int, tensor1: Tensor1) = {
128 | require(self.dim1 == tensor1.dim1)
129 | for (j <- 0 until self.dim1) self.update(j, i, tensor1(j))
130 | }
131 |
132 | def mul(value: Double): Tensor2 = (self * value).asInstanceOf[Tensor2]
133 |
134 | def getRow(ix: Int): Tensor1 =
135 | new DenseTensor1((for (i <- 0 until self.dim2) yield self(ix, i)).toArray)
136 |
137 | //only works if tensor2 is a SparseBinaryTensor2
138 | def getSparseRow(ix: Int): SparseTensor1 = {
139 | val matrix = self.asInstanceOf[SparseBinaryTensor2]
140 | val v = new SparseTensor1(self.dim2)
141 |
142 | val minIx = ix * self.dim2
143 | val maxIx = (ix + 1) * self.dim2
144 |
145 | val elems = matrix.activeElements.filter(p => minIx <= p._1 && p._1 < maxIx)
146 |
147 | elems.foreach(p => {
148 | val (ix, value) = p
149 | v.update(ix % self.dim2, value)
150 | })
151 |
152 | v
153 | }
154 | }
155 |
156 | /**
157 | * Pimped tensor3 with dim1 = layers, dim2 = rows, dim3 = columns
158 | */
159 | implicit class PimpedTensor3(self: Tensor3) {
160 | /**
161 | * Multiplies the tensor with a vector in mode 1, i.e., inner product with every mode 1 (tube) fiber.
162 | * TODO: generalize this to mode 2 and mode 3
163 | * TODO: this method is the performance bottleneck: use DenseLayeredTensor3 and pick the vectors you need!
164 | * TODO: is there a parallel implementation for this?
165 | */
166 | def firstModeVectorProduct(tensor1: Tensor1): Tensor2 = tensor1 match {
167 | case t: SparseTensor =>
168 | //FIXME: this is currently not general, since it only works for calculating the 2*1 score
169 | require(self.dim2 == 2 && self.dim3 == 1)
170 | val result = new DenseTensor2(self.dim2, self.dim3)
171 | var sum0 = 0.0
172 | var sum1 = 0.0
173 |
174 | t.activeElements.foreach(elem => {
175 | val (ix, value) = elem
176 | sum0 += self(ix, 0, 0) * value
177 | sum1 += self(ix, 1, 0) * value
178 | })
179 |
180 | result.update(0, 0, sum0)
181 | result.update(1, 0, sum1)
182 | result
183 | case _ =>
184 | //println(tensor1.getClass)
185 | //require(self.dim1 == tensor1.dim1, s"${self.toDimensionsString} * ${tensor1.toDimensionsString}")
186 | require(self.dim1 == tensor1.dim1)
187 |
188 | val tensor2 = new DenseTensor2(self.dim2, self.dim3)
189 | var i = 0
190 | var j = 0
191 | var k = 0
192 | while(j tensor1
346 | case tensor2: Tensor2 if tensor2.dim1 == 1 || tensor2.dim2 == 1 => new DenseTensor1(tensor2.asArray)
347 | case _ => throw new scala.MatchError("I don't know how to transform this into a Tensor1: " + tensor)
348 | }
349 |
350 | implicit def tensor1ToTensor2(tensor1: Tensor1): Tensor2 = {
351 | val tensor2 = new DenseTensor2(tensor1.dim1, 1) {
352 | _setArray(tensor1.asArray)
353 | }
354 | tensor2
355 | }
356 |
357 | //TODO: speed this up!
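// Descriptive note: tensor3ToTensor2 below flattens a (dim1 x dim2 x 1) Tensor3 into a
// (dim2 x dim1) matrix by swapping the first two modes, i.e. matrix(j, i) == tensor3(i, j, 0);
// tensorToTensor3 further down performs the corresponding inverse for Tensor2 inputs.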
358 | def tensor3ToTensor2(tensor3: Tensor3): Tensor2 = { 359 | require(tensor3.dim3 == 1) 360 | val matrix = new DenseTensor2(tensor3.dim2, tensor3.dim1) 361 | for { 362 | i <- 0 until tensor3.dim1 363 | j <- 0 until tensor3.dim2 364 | } matrix update(j, i, tensor3(i, j, 0)) 365 | 366 | matrix 367 | } 368 | 369 | //TODO: speed this up! 370 | def tensorToTensor3(tensor: Tensor): Tensor3 = tensor match { 371 | case tensor2: Tensor2 => 372 | val tensor3 = new DenseTensor3(tensor2.dim2, tensor2.dim1, 1) 373 | for { 374 | i <- 0 until tensor2.dim1 375 | j <- 0 until tensor2.dim2 376 | } tensor3.update(j, i, 0, tensor2(i, j)) 377 | tensor3 378 | case _ => throw new scala.MatchError("I don't know how to transform this into a Tensor3: " + tensor) 379 | } 380 | 381 | def featureMatrixToTensor3(tensor: Tensor, featureDim: Int): Tensor3 = tensor match { 382 | case tensor2: Tensor2 => 383 | //val numActive = tensor2.activeElements.size 384 | val tensor3 = new SparseIndexedTensor3(featureDim + 1, tensor2.dim1, 1) 385 | /*{ 386 | super.ensureCapacity(numActive) 387 | override def ensureCapacity(cap: Int): Unit = true 388 | }*/ 389 | //println("outer tensor: " + tensor2.toDimensionsString) 390 | //println("new tensor: " + tensor3.toDimensionsString) 391 | //println("outer: " + tensor2.toPrettyString) 392 | for { 393 | i <- tensor2.activeDomain1 394 | j <- tensor2.activeDomain2 395 | } { 396 | //println(i,j) 397 | tensor3.update(j, i, 0, tensor2(i, j)) 398 | } 399 | tensor3 400 | } 401 | 402 | val TENSOR3_ONE = { 403 | val one = new DenseTensor3(1,1,1) 404 | one update (0, 1.0) 405 | one 406 | } 407 | } -------------------------------------------------------------------------------- /src/main/scala/uclmr/ProbLogicEmbeddings.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import cc.factorie.la.DenseTensor1 4 | import cc.factorie.model.WeightsSet 5 | import cc.factorie.optimize._ 6 | import com.typesafe.config.Config 7 | import ml.wolfe.Wolfe._ 8 | import ml.wolfe._ 9 | import uclmr.FactorizationUtil.Row 10 | import ml.wolfe.fg.VectorMsgs 11 | import ml.wolfe.util.Util 12 | 13 | import scala.collection.mutable 14 | import scala.util.Random 15 | 16 | /** 17 | * @author Sebastian Riedel 18 | */ 19 | case class PredicateEmbedding(rel: String, embedding: FactorieVector, 20 | scale: Double, bias: Double, weight: Double = 1.0, 21 | observationFilter: String => Boolean = _ => true) { 22 | def distance(that: PredicateEmbedding) = { 23 | Util.sq(embedding.l2Similarity(that.embedding)) 24 | } 25 | } 26 | 27 | case class ProbLogicEmbeddings(embeddings: Map[String, PredicateEmbedding], 28 | rules: Rules = Rules(Map.empty, Map.empty), 29 | average: Boolean = true, forceZero: Boolean = true, 30 | usel2dist: Boolean = false, minWhenUsingL2Dist: Boolean = true) { 31 | 32 | 33 | def predict(observations: Seq[String], relation: String) = { 34 | embeddings.get(relation) match { 35 | case None => 0.0 36 | case Some(embedding) => 37 | val filteredObs = observations.filter(embedding.observationFilter) 38 | if (!forceZero || filteredObs.size > 0) { 39 | val normalizer = if (average) filteredObs.size.toDouble else 1.0 40 | var score = embedding.bias 41 | if (!usel2dist) for (obs <- filteredObs; obsEmb <- embeddings.get(obs)) { 42 | score += obsEmb.weight * embedding.scale * (embedding.embedding dot obsEmb.embedding) / normalizer // observations.size 43 | } else { 44 | //take the average 45 | if (minWhenUsingL2Dist) { 46 | val distances = for (obs <- 
filteredObs.view; obsEmb <- embeddings.get(obs).view) yield { 47 | Util.sq(obsEmb.embedding.l2Similarity(embedding.embedding)) 48 | } 49 | score -= distances.min 50 | } else { 51 | val result = new DenseTensor1(embedding.embedding) 52 | for (obs <- filteredObs; obsEmb <- embeddings.get(obs)) { 53 | result +=(obsEmb.embedding, -1.0 / normalizer) // observations.size 54 | } 55 | score -= result.twoNormSquared 56 | } 57 | } 58 | val result = Util.sig(score) 59 | result 60 | } else 0.0 61 | } 62 | } 63 | def predictRow(observation: Row, targets: Seq[String]) = { 64 | observation.copy(relations = targets.map(r => r -> predict(observation.observedTrue, r))) 65 | } 66 | 67 | def pairwiseRules(relPairs: Iterable[(String, String)]) = { 68 | val relations = embeddings.keys.toArray.sorted 69 | val marginals = (for (r <- relations) yield r -> predict(Seq.empty, r)).toMap 70 | val result = for ((rel1, rel2) <- relPairs; 71 | emb1 = embeddings(rel1); 72 | emb2 = embeddings(rel2)) yield { 73 | val prob1given2 = predict(Seq(rel2), rel1) 74 | val prob2given1 = predict(Seq(rel1), rel2) 75 | val prob1 = marginals(rel1) 76 | val prob2 = marginals(rel2) 77 | val probs = Map( 78 | (true, true) -> prob1given2 * prob2, //todo: this may be different to using the other way around 79 | (true, false) -> (1.0 - prob2given1) * prob1, 80 | (false, true) -> (1.0 - prob1given2) * prob2, 81 | (false, false) -> (1 - prob1) * (1 - prob2) 82 | ) 83 | (rel1, rel2) -> Rule2( 84 | rel1, rel2, probs, 85 | trueTrueInconsistency = math.abs(prob1given2 * prob2 - prob2given1 * prob1), 86 | cond1given2 = prob1given2, cond2given1 = prob2given1) 87 | } 88 | result.toMap 89 | } 90 | 91 | } 92 | 93 | case class Rules(rules2: Map[(String, String), Rule2], rules1: Map[String, Rule1] = Map.empty) { 94 | lazy val rel2RuleArg1 = rules2.toSeq.groupBy(_._1._1) withDefaultValue Seq.empty 95 | lazy val rel2RuleArg2 = rules2.toSeq.groupBy(_._1._2) withDefaultValue Seq.empty 96 | 97 | lazy val relations = rules2.keySet.map(_._1) ++ rules2.keySet.map(_._2) 98 | 99 | def pairwiseRuleCount(rel: String) = rel2RuleArg1(rel).size + rel2RuleArg2(rel).size 100 | 101 | def +(that: Rules): Rules = { 102 | val result = new mutable.HashMap[(String, String), Rule2] 103 | for ((pair, r1) <- rules2) { 104 | that.rules2.get(pair) match { 105 | case Some(r2) => result(pair) = r1 + r2 106 | case None => result(pair) = r1 107 | } 108 | } 109 | for ((pair, r2) <- that.rules2) if (!result.contains(pair)) result(pair) = r2 110 | copy(rules2 = result.toMap) 111 | 112 | } 113 | 114 | def withPriorCounts(priorCounts: Map[(Boolean, Boolean), Double]) = { 115 | val normalizer = priorCounts.values.sum 116 | val probs = priorCounts.mapValues(_ / normalizer) 117 | val rule = Rule2("r1", "r2", probs, count = normalizer, cond1given2 = 0, cond2given1 = 0) 118 | copy(rules2 = rules2.mapValues(_ + rule)) 119 | } 120 | 121 | } 122 | 123 | 124 | case class Rule2(rel1: String, rel2: String, probs: Map[(Boolean, Boolean), Double], scale: Double = 1, 125 | count: Double = 1.0, trueTrueInconsistency: Double = 0.0, cond1given2: Double, cond2given1: Double) { 126 | 127 | assert(cond1given2 <= 1.0) 128 | def marg1(b1: Boolean) = probs(b1, true) + probs(b1, false) 129 | def marg2(b2: Boolean) = probs(true, b2) + probs(false, b2) 130 | def prob2given1(b1: Boolean)(b2: Boolean) = probs(b1, b2) / marg1(b1) 131 | def prob1given2(b2: Boolean)(b1: Boolean) = probs(b1, b2) / marg2(b2) 132 | 133 | def cooccurCount = count * probs(true, true) 134 | override def toString = 135 | s"""$rel1 $rel2 ${ if 
(trueTrueInconsistency > 0.0) "(" + trueTrueInconsistency.toString + ")" else "" } 136 | |p(r1|r2) = ${ cond1given2 } 137 | |p(r2|r1) = ${ cond2given1 } 138 | |p(r1) = ${ marg1(true) } 139 | |p(r2) = ${ marg2(true) } 140 | """.stripMargin 141 | 142 | lazy val mutualInformation = { 143 | probs(true, true) * math.log(probs(true, true) / (marg1(true) * marg2(true))) + 144 | probs(true, false) * math.log(probs(true, false) / (marg1(true) * marg2(false))) + 145 | probs(false, true) * math.log(probs(false, true) / (marg1(false) * marg2(true))) + 146 | probs(false, false) * math.log(probs(false, false) / (marg1(false) * marg2(false))) 147 | } 148 | 149 | def +(that: Rule2) = { 150 | val newCount = count + that.count 151 | def newProb(b1: Boolean, b2: Boolean) = (probs(b1, b2) * count + that.probs(b1, b2) * that.count) / newCount 152 | val newProbs = Map( 153 | (true, true) -> newProb(true, true), 154 | (true, false) -> newProb(true, false), 155 | (false, true) -> newProb(false, true), 156 | (false, false) -> newProb(false, false) 157 | ) 158 | copy(probs = newProbs, count = newCount) 159 | } 160 | 161 | 162 | def klTerm(p1: Double, p2: Double) = if (p1 == 0.0) 0.0 else p1 * math.log(p1 / p2) 163 | 164 | def prob1given2Inc(that: Rule2) = cond1given2 - that.cond1given2 165 | 166 | def condKL(that: Rule2) = { 167 | klTerm(prob1given2(true)(true), that.prob1given2(true)(true)) + 168 | klTerm(prob1given2(true)(false), that.prob1given2(true)(false)) + 169 | klTerm(prob2given1(true)(true), that.prob2given1(true)(true)) + 170 | klTerm(prob2given1(true)(false), that.prob2given1(true)(false)) 171 | } 172 | 173 | def kl(that: Rule2) = { 174 | klTerm(probs(true, true), that.probs(true, true)) + 175 | klTerm(probs(true, false), that.probs(true, false)) + 176 | klTerm(probs(false, true), that.probs(false, true)) + 177 | klTerm(probs(false, false), that.probs(false, false)) 178 | } 179 | 180 | } 181 | case class Rule1(rel: String, prob: Double) 182 | 183 | object RuleInjector { 184 | def injectImplication(rule: Rule2, forward: Boolean = true): Rule2 = { 185 | forward match { 186 | case true => 187 | val probs = Map((true, true) -> (rule.probs(true, true) + rule.probs(true, false)), (true, false) -> 0.0) 188 | rule.copy(probs = rule.probs ++ probs) 189 | case false => 190 | val probs = Map((true, true) -> (rule.probs(true, true) + rule.probs(false, true)), (false, true) -> 0.0) 191 | rule.copy(probs = rule.probs ++ probs) 192 | } 193 | } 194 | } 195 | 196 | object ProbLogicEmbedder { 197 | 198 | def embed(rules: Rules)(implicit conf: Config): ProbLogicEmbeddings = { 199 | 200 | import ml.wolfe.FactorGraph.Node 201 | 202 | val random = new Random(0) 203 | val relations = rules.rules2.values.flatMap(r => Seq(r.rel1, r.rel2)).distinct.sorted.toArray 204 | val numRelations = relations.size 205 | val fg = new FactorGraph 206 | val k = conf.getInt("epl.relation-dim") 207 | val regW = conf.getDouble("epl.reg-embed") 208 | val regS = conf.getDouble("epl.reg-scale") 209 | val regBias = conf.getDouble("epl.reg-bias") 210 | val regMult = conf.getDouble("epl.reg-mult") 211 | val doNormB = conf.getBoolean("epl.norm-b") 212 | val scalePrior = conf.getDouble("epl.scale-prior") 213 | val biasPrior = conf.getDouble("epl.bias-prior") 214 | val multPrior = conf.getDouble("epl.mult-prior") 215 | val weighTerms = conf.getBoolean("epl.weigh-terms") 216 | val unitBall = conf.getBoolean("epl.unit-ball") 217 | val l2dist = conf.getBoolean("epl.l2-dist") 218 | 219 | val maxMarg = rules.rules2.view.flatMap(t => Iterator(t._2.marg1(true),
t._2.marg2(true))).max 220 | 221 | val V = relations.map(r => r -> fg.addVectorNode(k, r)).toMap 222 | if (unitBall) for (n <- V.values) n.variable.asVector.unitVector = true 223 | def emptyMap = Map.empty[String, Node] withDefaultValue null 224 | val colScales = if (regS == Double.PositiveInfinity) emptyMap else relations.map(i => i -> fg.addVectorNode(1)).toMap 225 | val colBiases = if (regBias == Double.PositiveInfinity) emptyMap else relations.map(i => i -> fg.addVectorNode(1)).toMap 226 | val colMults = if (regMult == Double.PositiveInfinity) emptyMap else relations.map(i => i -> fg.addVectorNode(1)).toMap 227 | 228 | //initialize 229 | for (n <- V.values; i <- 0 until k) n.variable.asVector.b(i) = random.nextGaussian() * 1.0 230 | for (n <- colScales.values) n.variable.asVector.b(0) = scalePrior 231 | for (n <- colBiases.values) n.variable.asVector.b(0) = biasPrior 232 | for (n <- colMults.values) n.variable.asVector.b(0) = multPrior 233 | 234 | val numberOfTerms = numRelations * (numRelations - 1) / 2.0 235 | val objNormalizer = 1.0 / numberOfTerms 236 | 237 | println("Building factor graph") 238 | 239 | var numJointFactors = 0 240 | for (rel1Index <- 0 until relations.length; rel2Index <- rel1Index + 1 until relations.size) { 241 | val rel1 = relations(rel1Index) 242 | val rel2 = relations(rel2Index) 243 | 244 | val v1 = V(rel1) 245 | val v2 = V(rel2) 246 | 247 | val s1 = colScales(rel1) 248 | val s2 = colScales(rel2) 249 | 250 | val eta1 = colBiases(rel1) 251 | val eta2 = colBiases(rel2) 252 | 253 | val m1 = colMults(rel1) 254 | val m2 = colMults(rel2) 255 | 256 | val relNormalizer1 = rules.pairwiseRuleCount(rel1) 257 | val relNormalizer2 = rules.pairwiseRuleCount(rel2) 258 | 259 | rules.rules2.get((rel1, rel2)) match { 260 | case Some(rule) => 261 | if (!l2dist) fg.buildFactor(Seq(v1, eta1, s1, m1, v2, eta2, s2, m2))(_ map (_ => new VectorMsgs)) { 262 | e => new JointPotential( 263 | e(0), e(1), e(2), e(3), 264 | e(4), e(5), e(6), e(7), 265 | rule.prob1given2(true)(true), rule.prob2given1(true)(true), 266 | rule.marg1(true), rule.marg2(true), 267 | regW, regBias, regS, regMult, 268 | biasPrior, scalePrior, multPrior, 269 | 1.0 / relNormalizer1, 1.0 / relNormalizer2, 270 | if (weighTerms) rule.marg1(true) / maxMarg else 1.0, 271 | if (weighTerms) rule.marg2(true) / maxMarg else 1.0) 272 | } else 273 | fg.buildFactor(Seq(v1, v2, eta1, eta2))(_ map (_ => new VectorMsgs)) { 274 | e => new L2DistanceBasedPotential( 275 | e(0), e(1), e(2), e(3), 276 | rule.prob1given2(true)(true), rule.prob2given1(true)(true), 277 | 1.0, 278 | regW, regBias, biasPrior, 279 | 1.0 / relNormalizer1, 1.0 / relNormalizer2) 280 | } 281 | //if (numJointFactors == 0) PotentialDebugger.checkGradients(factor.potential, debug = true) 282 | numJointFactors += 1 283 | 284 | case _ => 285 | } 286 | } 287 | 288 | fg.build() 289 | println(s"Optimizing... 
with ${ fg.factors.size } terms") 290 | 291 | val maxIterations = conf.getInt("epl.opt-iterations") 292 | 293 | 294 | // val step = new AdaGrad(conf.getDouble("epl.ada-rate")) with UnitBallProjection 295 | val step = new AdaMira(conf.getDouble("epl.ada-rate")) with UnitBallProjection 296 | //val step = new LBFGS() with UnitBallProjection 297 | 298 | 299 | def trainer(weightsSet: WeightsSet) = conf.getString("epl.trainer") match { 300 | case "batch" => new BatchTrainer(weightsSet, step, maxIterations) 301 | case "online" => new OnlineTrainer(weightsSet, step, maxIterations) 302 | } 303 | 304 | GradientBasedOptimizer(fg, trainer(_), step) 305 | //GradientBasedOptimizer(fg, new BatchTrainer(_, new LBFGS(), maxIterations)) 306 | 307 | //allowed observations for each predicate are only the relations we have seen together with the predicate 308 | val allowed = new mutable.HashMap[String, mutable.HashSet[String]]() 309 | for ((r1, r2) <- rules.rules2.keys) { 310 | allowed.getOrElseUpdate(r1, new mutable.HashSet[String]) += r2 311 | allowed.getOrElseUpdate(r2, new mutable.HashSet[String]) += r1 312 | } 313 | //val allPairs = rules.rules2.keySet.flatMap(p => Set(p, p.swap)) 314 | //val allowed = allPairs.groupBy(_._1).mapValues(_.map(_._2)) 315 | val embeddings = relations.map({ rel => 316 | rel -> PredicateEmbedding(rel, 317 | V(rel).variable.asVector.b, 318 | if (regS == Double.PositiveInfinity) scalePrior else colScales(rel).variable.asVector.b(0), 319 | if (regBias == Double.PositiveInfinity) biasPrior else colBiases(rel).variable.asVector.b(0), 320 | if (regMult == Double.PositiveInfinity) multPrior else colMults(rel).variable.asVector.b(0), 321 | allowed(rel)) 322 | }) 323 | ProbLogicEmbeddings(embeddings.toMap, rules, usel2dist = l2dist) 324 | } 325 | 326 | } 327 | 328 | 329 | object RuleLearner { 330 | def learn(rows: Seq[Row], priorCounts: Map[(Boolean, Boolean), Double] = Map.empty withDefaultValue 0.0): Rules = { 331 | val pairCounts = mutable.HashMap[(String, String), Int]() withDefaultValue 0 332 | val singleCounts = mutable.HashMap[String, Int]() withDefaultValue 0 333 | 334 | for (row <- rows) { 335 | val cells = row.relations 336 | for (cell <- cells) singleCounts(cell._1) += 1 337 | for (i <- 0 until cells.size; j <- i + 1 until cells.size) { 338 | //todo: more sensible to sort relations here instead of adding two versions.
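// both orderings of the pair are counted so that later lookups such as
// pairCounts((rel1, rel2)) succeed regardless of argument order (cf. the todo above)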
339 | pairCounts(cells(i)._1 -> cells(j)._1) += 1 340 | pairCounts(cells(j)._1 -> cells(i)._1) += 1 341 | } 342 | } 343 | 344 | val relations = singleCounts.keys.toArray.sorted 345 | val normalizer = rows.size.toDouble + priorCounts.values.sum 346 | val rules2 = for (r1 <- 0 until relations.size; r2 <- r1 + 1 until relations.size) yield { 347 | val rel1 = relations(r1) 348 | val rel2 = relations(r2) 349 | val pairCount = pairCounts((rel1, rel2)) 350 | val singleCount1 = singleCounts(rel1) 351 | val singleCount2 = singleCounts(rel2) 352 | val prob11 = (pairCount + priorCounts(true, true)) / normalizer 353 | val prob10 = ((singleCount1 - pairCounts(rel1, rel2)) + priorCounts(true, false)) / normalizer 354 | val prob01 = ((singleCount2 - pairCounts(rel1, rel2)) + priorCounts(false, true)) / normalizer 355 | val prob00 = 1.0 - prob11 - prob10 - prob01 356 | val probs = Map( 357 | (true, true) -> prob11, (true, false) -> prob10, 358 | (false, true) -> prob01, (false, false) -> prob00 359 | ) 360 | (rel1, rel2) -> Rule2(rel1, rel2, probs, 1.0, count = normalizer, 361 | cond1given2 = prob11 / (prob01 + prob11), 362 | cond2given1 = prob11 / (prob10 + prob11)) 363 | } 364 | val rules1 = for ((r, c) <- singleCounts) yield r -> Rule1(r, c / normalizer) 365 | Rules(rules2.toMap, rules1.toMap) 366 | } 367 | } 368 | 369 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/SoftLogicPotentials.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import ml.wolfe.FactorGraph.Edge 4 | import ml.wolfe._ 5 | import ml.wolfe.fg.{Regularization, Potential} 6 | import ml.wolfe.util.Conf 7 | 8 | /** 9 | * A potential for a formula containing two predicates 10 | * @param constEdge edge to variable that refers to a constant 11 | * @param predicate1Edge edge to first predicate 12 | * @param predicate2Edge edge to second predicate 13 | * @param target target 14 | * @param lambda regularization parameter 15 | * @author rockt 16 | */ 17 | abstract class Formula2Potential(constEdge: Edge, predicate1Edge: Edge, predicate2Edge: Edge, target: Double = 1.0, 18 | val lambda: Double = 0.0, weight: Double = 1.0) extends Potential with Regularization { 19 | def cVar = constEdge.n.variable.asVector 20 | def p1Var = predicate1Edge.n.variable.asVector 21 | def p2Var = predicate2Edge.n.variable.asVector 22 | val cMsgs = constEdge.msgs.asVector 23 | val p1Msgs = predicate1Edge.msgs.asVector 24 | val p2Msgs = predicate2Edge.msgs.asVector 25 | 26 | def sig(x: Double) = 1.0 / (1.0 + math.exp(-x)) 27 | 28 | private def innerLossAndDirection(s: Double): (Double, Int) = 29 | if (target >= s) (1 + s - target, 1) 30 | else (1 + target - s, -1) 31 | 32 | override def valueForCurrentSetting(): Double = { 33 | val c = cVar.setting 34 | val p1 = p1Var.setting 35 | val p2 = p2Var.setting 36 | val p1c = sig(c dot p1) 37 | val p2c = sig(c dot p2) 38 | 39 | val s = F(p1c, p2c) 40 | 41 | val loss = innerLossAndDirection(s)._1 42 | math.log(loss) * weight + regLoss(c) + regLoss(p1) + regLoss(p2) 43 | } 44 | 45 | override def valueAndGradientForAllEdges(): Double = { 46 | val p1c = sig(cMsgs.n2f dot p1Msgs.n2f) 47 | val p2c = sig(cMsgs.n2f dot p2Msgs.n2f) 48 | 49 | val s = F(p1c, p2c) 50 | 51 | val (loss, dir) = innerLossAndDirection(s) 52 | 53 | val p1c_p1 = cMsgs.n2f * p1c * (1 - p1c) 54 | val p1c_c = p1Msgs.n2f * p1c * (1 - p1c) 55 | val p2c_p2 = cMsgs.n2f * p2c * (1 - p2c) 56 | val p2c_c = p2Msgs.n2f * p2c * (1 - p2c) 57 | 58 | 
p1Msgs.f2n = (calcF_p1(p1c_p1, p2c) * (1.0 / loss) * dir) * weight + regGradient(p1Msgs.n2f) 59 | p2Msgs.f2n = (calcF_p2(p2c_p2, p1c) * (1.0 / loss) * dir) * weight + regGradient(p2Msgs.n2f) 60 | cMsgs.f2n = 61 | if (Conf.getBoolean("mf.inject-rows")) 62 | (calcF_c(p2c_c, p1c, p1c_c, p2c) * (1.0 / loss) * dir) * weight + regGradient(cMsgs.n2f) 63 | else 64 | new SparseVector(cMsgs.n2f.length) 65 | 66 | 67 | math.log(loss) * weight + regLoss(cMsgs.n2f) + regLoss(p1Msgs.n2f) + regLoss(p2Msgs.n2f) 68 | } 69 | 70 | /** 71 | * Calculates the score of a formula F that contains two predicates p1, p2. 72 | * @param p1c score of [p1(c)] 73 | * @param p2c score of [p2(c)] 74 | * @return score of [F] 75 | */ 76 | def F(p1c: Double, p2c: Double): Double 77 | /** 78 | * Calculates gradient of [p1] in formula F. 79 | * @param p1c_p1 gradient of [p1] in [p1(c)] 80 | * @param p2c score of [p2(c)] 81 | * @return gradient of [p1] 82 | */ 83 | def calcF_p1(p1c_p1: FactorieVector, p2c: Double): FactorieVector 84 | /** 85 | * Calculates gradient of [p2] in formula F. 86 | * @param p2c_p2 gradient of [p2] in [p2(c)] 87 | * @param p1c score of [p1(c)] 88 | * @return gradient of [p2] 89 | */ 90 | def calcF_p2(p2c_p2: FactorieVector, p1c: Double): FactorieVector 91 | /** 92 | * Calculates gradient of [c] in formula F. 93 | * @param p2c_c gradient of [c] in [p2(c)] 94 | * @param p1c score of [p1(c)] 95 | * @param p1c_c gradient of [c] in [p1(c)] 96 | * @param p2c score of [p2(c)] 97 | * @return gradient of [c] 98 | */ 99 | def calcF_c(p2c_c: FactorieVector, p1c: Double, p1c_c: FactorieVector, p2c: Double): FactorieVector 100 | } 101 | 102 | 103 | class ImplPotential(constEdge: Edge, pred1Edge: Edge, pred2Edge: Edge, target: Double = 1.0, override val lambda: Double = 0.0, weight: Double = 1.0) 104 | extends Formula2Potential(constEdge, pred1Edge, pred2Edge, target, lambda, weight) { 105 | //[p₁(c) => p₂(c)] := [p₁(c)]*([p₂(c)] - 1) + 1 106 | def F(p1c: Double, p2c: Double) = p1c * (p2c - 1) + 1 107 | def calcF_p1(p1c_p1: FactorieVector, p2c: Double) = p1c_p1 * (p2c - 1) 108 | def calcF_p2(p2c_p2: FactorieVector, p1c: Double) = p2c_p2 * p1c 109 | def calcF_c(p2c_c: FactorieVector, p1c: Double, p1c_c: FactorieVector, p2c: Double) = 110 | p2c_c * p1c + p1c_c * (p2c - 1) 111 | } 112 | 113 | 114 | class ImplNegPotential(constEdge: Edge, pred1Edge: Edge, pred2Edge: Edge, target: Double = 1.0, override val lambda: Double = 0.0, weight: Double = 1.0) 115 | extends Formula2Potential(constEdge, pred1Edge, pred2Edge, target, lambda, weight) { 116 | //[p₁(c) => ¬p₂(c)] := [p₁(c)]*(-[p₂(c)]) + 1 117 | def F(p1c: Double, p2c: Double) = p1c * -p2c + 1 118 | def calcF_p1(p1c_p1: FactorieVector, p2c: Double) = p1c_p1 * -p2c 119 | def calcF_p2(p2c_p2: FactorieVector, p1c: Double) = p2c_p2 * -p1c 120 | def calcF_c(p2c_c: FactorieVector, p1c: Double, p1c_c: FactorieVector, p2c: Double) = 121 | p2c_c * -p1c + p1c_c * -p2c 122 | } -------------------------------------------------------------------------------- /src/main/scala/uclmr/future/MatrixFactorization2.scala: -------------------------------------------------------------------------------- 1 | package uclmr.future 2 | 3 | import cc.factorie.la.DenseTensor1 4 | import cc.factorie.optimize.{AdaGrad, BatchTrainer} 5 | import ml.wolfe.fg20._ 6 | import ml.wolfe.util.Math._ 7 | 8 | import scala.util.Random 9 | 10 | /** 11 | * @author rockt 12 | */ 13 | object MatrixFactorization2 extends App { 14 | val k = 5 15 | val cols = Array("r1", "r2", "r3", "r4", "r5").map(new VectVar(k, _)) 16 
| val rows = Array("e1", "e2", "e3", "e4", "e5").map(new VectVar(k, _)) 17 | 18 | val initialState = new MapBasedState( 19 | (cols ++ rows).map(_ -> new DenseTensor1((0 until k).map(i => random.nextGaussian() * 0.1).toArray)).toMap 20 | ) 21 | 22 | val data = Array( 23 | Array(1, 0, 1, 0, 1), 24 | Array(1, 1, 0, 0, 1), 25 | Array(0, 0, 0, 1, 0), 26 | Array(1, 0, 1, 0, 0), 27 | Array(1, 0, 0, 0, 1) 28 | ) 29 | 30 | val potentials = 31 | (0 until rows.length).flatMap(r => { 32 | (0 until cols.length).collect { 33 | case c if data(r)(c) == 1 => 34 | new FlatSum[Differentiable](Seq( 35 | new MFLogisticPotential(rows(r), cols(c)), 36 | new L2Regularization(0.01, rows(r), cols(c)) 37 | )) with DifferentiableSum 38 | } 39 | }) 40 | 41 | val stochasticPotentials = 42 | (0 until rows.length).flatMap(r => { 43 | (0 until cols.length).collect { 44 | case c if data(r)(c) == 1 => 45 | def sampledRow = rows(random.nextInt(rows.length)) 46 | 47 | new FlatSum[Differentiable](Seq( 48 | new MFLogisticPotential(sampledRow, cols(c), 0.0), 49 | new L2Regularization(0.01, sampledRow, cols(c)) 50 | )) with DifferentiableSum 51 | 52 | 53 | //new MFLogisticPotential(sampledRow, cols(c), 0.0) 54 | } 55 | }) 56 | 57 | val problem = Problem(potentials ++ stochasticPotentials) 58 | 59 | val optimizer = new GradientBasedOptimizer(problem) 60 | 61 | val result = optimizer.gradientBasedArgmax(new BatchTrainer(_, new AdaGrad(0.1), 100), init = initialState) 62 | 63 | 64 | print("\t") 65 | cols.foreach(c => print(c + "\t")) 66 | println() 67 | rows.foreach(r => { 68 | print(r.name + "\t") 69 | cols.foreach(c => print(sigmoid(result.state(r) dot result.state(c)) + "\t")) 70 | println() 71 | }) 72 | 73 | } 74 | 75 | 76 | 77 | class MFLogisticPotential(rowVar: => VectVar, colVar: => VectVar, target: Double = 1.0) 78 | extends StatelessDifferentiable with StatelessScorer with VectPotential { 79 | 80 | override def vectVars: Array[VectVar] = Array(rowVar, colVar) 81 | 82 | private def innerLossAndDirection(s: Double): (Double, Int) = 83 | if (target >= s) (1 + s - target, 1) 84 | else (1 + target - s, -1) 85 | 86 | 87 | override def score(setting: Setting): Double = { 88 | val row = setting.vect(0) 89 | val col = setting.vect(1) 90 | 91 | val score = sigmoid(row dot col) 92 | 93 | val (loss, dir) = innerLossAndDirection(score) 94 | 95 | math.log(loss) 96 | } 97 | 98 | override def gradientAndValue(currentParameters: PartialSetting, gradient: Setting): Double = { 99 | val row = currentParameters.vect(0) 100 | val col = currentParameters.vect(1) 101 | 102 | val score = sigmoid(row dot col) 103 | 104 | val (loss, dir) = innerLossAndDirection(score) 105 | 106 | gradient.vect(0) = col * (1.0 - loss) * dir 107 | gradient.vect(1) = row * (1.0 - loss) * dir 108 | 109 | math.log(loss) 110 | } 111 | } 112 | 113 | /** 114 | * λ * Σ_i ||v_i||² 115 | */ 116 | class L2Regularization(lambda: Double, vars: VectVar*) extends StatelessDifferentiable with StatelessScorer with VectPotential { 117 | override def vectVars: Array[VectVar] = vars.toArray 118 | 119 | override def score(setting: Setting): Double = 120 | if (lambda == 0) 0 121 | else -lambda * setting.vect.map(v => v.twoNormSquared).sum 122 | 123 | override def gradientAndValue(currentParameters: PartialSetting, gradient: Setting): Double = { 124 | if (lambda != 0) 125 | (0 until vectVars.length).foreach(i => gradient.vect(i) = currentParameters.vect(i) * lambda * -2) 126 | 127 | score(currentParameters) 128 | } 129 | } 130 | 
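// A minimal self-contained sketch (added for illustration; `MFLogisticLossSketch` is not part
// of the original source): it reproduces the pointwise objective that MFLogisticPotential
// optimizes above. For target t and score s = σ(row ⋅ col), the loss is log(1 + s - t) when
// t >= s and log(1 + t - s) otherwise, so it is maximal (zero) exactly when s equals t.
object MFLogisticLossSketch extends App {
  def sigmoid(x: Double): Double = 1.0 / (1.0 + math.exp(-x))

  // the same inner loss as MFLogisticPotential.innerLossAndDirection, followed by math.log
  def logLoss(s: Double, target: Double): Double =
    if (target >= s) math.log(1 + s - target) else math.log(1 + target - s)

  val s = sigmoid(2.0)                // ≈ 0.881 for a cell predicted true
  println(logLoss(s, 1.0))            // ≈ -0.127, approaches 0.0 as s -> target
  println(logLoss(sigmoid(0.0), 1.0)) // = log(0.5) ≈ -0.693 for an uninformative score
}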
-------------------------------------------------------------------------------- /src/main/scala/uclmr/hack/MTShowcase.scala: -------------------------------------------------------------------------------- 1 | package uclmr.hack 2 | 3 | import java.io.FileWriter 4 | 5 | import uclmr.util.FormulaeExtractor._ 6 | import uclmr.{Formula, Impl, ImplNeg, TensorKB} 7 | import ml.wolfe.util.ProgressBar 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | /** 16 | * Finds the strongest connections between Portuguese shallow textual patterns and English dependency path patterns. 17 | * @author rockt 18 | */ 19 | object MTShowcase extends App { 20 | type Rule = Formula 21 | type Entity = Any 22 | type Relation = Any 23 | type SPDB = TensorKB 24 | 25 | def formulaScore(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true)(implicit db: SPDB): (Double, Int) = { 26 | val rows = pairs.filter(argFilter).map(_.head) 27 | (rows.map(e => rule(e)).sum / rows.size, rows.size) 28 | } 29 | 30 | /** 31 | * Calculates the weight of the formula based on matrix factorization predictions on observed premises. 32 | */ 33 | def formulaScoreMF(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true)(implicit db: SPDB): (Double, Int) = { 34 | val p1 = rule.predicates(0) 35 | val rows = db.getBy1(p1).map { case (ei, ej) => List(ei) } 36 | 37 | //we only care about the score over true observed premises 38 | val filteredRows = rows.filter(argFilter).map(_.head).filter(e => { 39 | val cell = db.get(p1,e).get 40 | cell.train && cell.target == 1.0 41 | }) 42 | 43 | (filteredRows.map(e => rule(e)).sum / filteredRows.size, filteredRows.size) 44 | } 45 | 46 | /** 47 | * Calculates the weight of the formula based on matrix factorization predictions. 48 | */ 49 | def formulaScoreMFPredicted(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true, threshold: Double = 0.1, onlyUnobserved: Boolean = true)(implicit db: SPDB): (Double, Int) = { 50 | val p1 = rule.predicates(0) 51 | val rows = db.getBy1(p1).map { case (ei, ej) => List(ei) } 52 | 53 | //we only care about the score over true predicted premises 54 | val filteredRows = rows.filter(argFilter).map(_.head).filter(e => db.prob(p1,e) >= threshold && (!onlyUnobserved || !db.get(p1, e).get.train)) 55 | 56 | if (filteredRows.isEmpty) (1.0, 0) else (filteredRows.map(e => rule(e)).sum / filteredRows.size, filteredRows.size) 57 | } 58 | 59 | /** 60 | * Calculates the weight of the formula based on the training data.
61 | */ 62 | def formulaScoreData(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true)(implicit db: SPDB): (Double, Int) = { 63 | val p1 = rule.predicates(0) 64 | val rows = db.getBy1(p1).map { case (ei, ej) => List(ei) } 65 | 66 | //we only care about the score over true observed premises 67 | val filteredRows = rows.filter(argFilter).map(_.head).filter(e => { 68 | val cell = db.get(p1,e).get 69 | cell.train && cell.target == 1.0 70 | }) 71 | 72 | (filteredRows.map(e => rule match { 73 | case Impl(_, p2, _) => if (db.get(p1, e).get.target == 1.0 && db.get(p2, e).map(_.target).getOrElse(0.0) == 0.0) 0.0 else 1.0 74 | case ImplNeg(_, p2, _) => if (db.get(p1, e).get.target == 1.0 && db.get(p2, e).map(_.target).getOrElse(0.0) == 1.0) 0.0 else 1.0 75 | }).sum / filteredRows.size, filteredRows.size) 76 | } 77 | 78 | def implScore(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 79 | formulaScore(Impl(r1,r2), pairs) 80 | 81 | def implNegScore(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 82 | formulaScore(ImplNeg(r1,r2), pairs) 83 | 84 | def implScoreMF(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 85 | formulaScoreMF(Impl(r1,r2), pairs) 86 | 87 | def implNegScoreMF(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 88 | formulaScoreMF(ImplNeg(r1,r2), pairs) 89 | 90 | def implScoreMFPredicted(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 91 | formulaScoreMFPredicted(Impl(r1,r2), pairs) 92 | 93 | def implNegScoreMFPredicted(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 94 | formulaScoreMFPredicted(ImplNeg(r1,r2), pairs) 95 | 96 | def implScoreData(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 97 | formulaScoreData(Impl(r1,r2), pairs) 98 | 99 | def implNegScoreData(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) = 100 | formulaScoreData(ImplNeg(r1,r2), pairs) 101 | 102 | 103 | 104 | 105 | 106 | implicit val db = new SPDB 107 | 108 | println("Loading db...") 109 | db.deserialize(args.lift(0).getOrElse("wolfe-apps/data/out/bbc/serialized/")) 110 | println(db.toInfoString) 111 | 112 | val premises = db.relations.filter(_.toString.startsWith("por")) 113 | //val consequents = db.relations.filter(consequentFilter) 114 | val consequents = db.relations.filter(_.toString.startsWith("eng")) 115 | 116 | val rows = db.trainCells.map(_.key2).distinct.map(List(_)) 117 | //.map { case (ei, ej) => List(ei, ej) } 118 | 119 | println("Generating formulae...") 120 | val progressBar = new ProgressBar(consequents.size * premises.size, 1000) 121 | progressBar.start() 122 | 123 | val potentialRules = for { 124 | consequent <- consequents 125 | premise <- premises 126 | if premise != consequent 127 | } yield { 128 | val (scoreMF, numPremisesMF) = implScoreMF(premise, consequent, rows) 129 | val (scoreData, _) = implScoreData(premise, consequent, rows) 130 | progressBar.apply(consequent.toString) 131 | (scoreMF, scoreData, numPremisesMF, premise, consequent) 132 | } 133 | 134 | println() 135 | println("Writing formulae...") 136 | val ruleWriter = new FileWriter("wolfe-apps/data/formulae/mt.txt") 137 | potentialRules 138 | //.filter(_._2 >= 0.9) 139 | .filter(_._3 >= 10) 140 | .sortBy(-_._1) 141 | //.sortBy(-_._2) 142 | .take(100000) 143 | .zipWithIndex 144 | .foreach(z => {
145 | val (t, ix) = z 146 | ruleWriter.write("//%d\t%.2f\t%.2f\t%d\n".format(ix + 1, t._1, t._2, t._3)) 147 | ruleWriter.write(s"${t._4} => ${t._5}\n\n") 148 | }) 149 | ruleWriter.close() 150 | } -------------------------------------------------------------------------------- /src/main/scala/uclmr/io/FigerPB.scala: -------------------------------------------------------------------------------- 1 | package uclmr.io 2 | 3 | import com.google.protobuf.CodedOutputStream._ 4 | import com.google.protobuf.ExtensionRegistryLite._ 5 | 6 | /** 7 | * Created by sameer on 11/16/14. 8 | */ 9 | object FigerPB { 10 | // Generated by ScalaBuff, the Scala Protocol Buffers compiler. DO NOT EDIT! 11 | // source: entity.proto 12 | 13 | final case class Mention ( 14 | `start`: Option[Int] = None, 15 | `end`: Option[Int] = None, 16 | `tokens`: collection.immutable.Seq[String] = Vector.empty[String], 17 | `posTags`: collection.immutable.Seq[String] = Vector.empty[String], 18 | `deps`: collection.immutable.Seq[Mention.Dependency] = Vector.empty[Mention.Dependency], 19 | `entityName`: Option[String] = None, 20 | `features`: collection.immutable.Seq[String] = Vector.empty[String], 21 | `labels`: collection.immutable.Seq[String] = Vector.empty[String], 22 | `sentid`: Option[Int] = None, 23 | `fileid`: Option[String] = None 24 | ) extends com.google.protobuf.GeneratedMessageLite 25 | with com.google.protobuf.MessageLite.Builder 26 | with net.sandrogrzicic.scalabuff.Message[Mention] 27 | with net.sandrogrzicic.scalabuff.Parser[Mention] { 28 | 29 | def setStart(_f: Int) = copy(`start` = Some(_f)) 30 | def setEnd(_f: Int) = copy(`end` = Some(_f)) 31 | def setTokens(_i: Int, _v: String) = copy(`tokens` = `tokens`.updated(_i, _v)) 32 | def addTokens(_f: String) = copy(`tokens` = `tokens` :+ _f) 33 | def addAllTokens(_f: String*) = copy(`tokens` = `tokens` ++ _f) 34 | def addAllTokens(_f: TraversableOnce[String]) = copy(`tokens` = `tokens` ++ _f) 35 | def setPosTags(_i: Int, _v: String) = copy(`posTags` = `posTags`.updated(_i, _v)) 36 | def addPosTags(_f: String) = copy(`posTags` = `posTags` :+ _f) 37 | def addAllPosTags(_f: String*) = copy(`posTags` = `posTags` ++ _f) 38 | def addAllPosTags(_f: TraversableOnce[String]) = copy(`posTags` = `posTags` ++ _f) 39 | def setDeps(_i: Int, _v: Mention.Dependency) = copy(`deps` = `deps`.updated(_i, _v)) 40 | def addDeps(_f: Mention.Dependency) = copy(`deps` = `deps` :+ _f) 41 | def addAllDeps(_f: Mention.Dependency*) = copy(`deps` = `deps` ++ _f) 42 | def addAllDeps(_f: TraversableOnce[Mention.Dependency]) = copy(`deps` = `deps` ++ _f) 43 | def setEntityName(_f: String) = copy(`entityName` = Some(_f)) 44 | def setFeatures(_i: Int, _v: String) = copy(`features` = `features`.updated(_i, _v)) 45 | def addFeatures(_f: String) = copy(`features` = `features` :+ _f) 46 | def addAllFeatures(_f: String*) = copy(`features` = `features` ++ _f) 47 | def addAllFeatures(_f: TraversableOnce[String]) = copy(`features` = `features` ++ _f) 48 | def setLabels(_i: Int, _v: String) = copy(`labels` = `labels`.updated(_i, _v)) 49 | def addLabels(_f: String) = copy(`labels` = `labels` :+ _f) 50 | def addAllLabels(_f: String*) = copy(`labels` = `labels` ++ _f) 51 | def addAllLabels(_f: TraversableOnce[String]) = copy(`labels` = `labels` ++ _f) 52 | def setSentid(_f: Int) = copy(`sentid` = Some(_f)) 53 | def setFileid(_f: String) = copy(`fileid` = Some(_f)) 54 | 55 | def clearStart = copy(`start` = None) 56 | def clearEnd = copy(`end` = None) 57 | def clearTokens = copy(`tokens` = Vector.empty[String]) 58 | 
def clearPosTags = copy(`posTags` = Vector.empty[String]) 59 | def clearDeps = copy(`deps` = Vector.empty[Mention.Dependency]) 60 | def clearEntityName = copy(`entityName` = None) 61 | def clearFeatures = copy(`features` = Vector.empty[String]) 62 | def clearLabels = copy(`labels` = Vector.empty[String]) 63 | def clearSentid = copy(`sentid` = None) 64 | def clearFileid = copy(`fileid` = None) 65 | 66 | def writeTo(output: com.google.protobuf.CodedOutputStream) { 67 | if (`start`.isDefined) output.writeInt32(1, `start`.get) 68 | if (`end`.isDefined) output.writeInt32(2, `end`.get) 69 | for (_v <- `tokens`) output.writeString(3, _v) 70 | for (_v <- `posTags`) output.writeString(4, _v) 71 | for (_v <- `deps`) output.writeMessage(5, _v) 72 | if (`entityName`.isDefined) output.writeString(6, `entityName`.get) 73 | for (_v <- `features`) output.writeString(7, _v) 74 | for (_v <- `labels`) output.writeString(8, _v) 75 | if (`sentid`.isDefined) output.writeInt32(9, `sentid`.get) 76 | if (`fileid`.isDefined) output.writeString(10, `fileid`.get) 77 | } 78 | 79 | lazy val getSerializedSize = { 80 | import com.google.protobuf.CodedOutputStream._ 81 | var __size = 0 82 | if (`start`.isDefined) __size += computeInt32Size(1, `start`.get) 83 | if (`end`.isDefined) __size += computeInt32Size(2, `end`.get) 84 | for (_v <- `tokens`) __size += computeStringSize(3, _v) 85 | for (_v <- `posTags`) __size += computeStringSize(4, _v) 86 | for (_v <- `deps`) __size += computeMessageSize(5, _v) 87 | if (`entityName`.isDefined) __size += computeStringSize(6, `entityName`.get) 88 | for (_v <- `features`) __size += computeStringSize(7, _v) 89 | for (_v <- `labels`) __size += computeStringSize(8, _v) 90 | if (`sentid`.isDefined) __size += computeInt32Size(9, `sentid`.get) 91 | if (`fileid`.isDefined) __size += computeStringSize(10, `fileid`.get) 92 | 93 | __size 94 | } 95 | 96 | def mergeFrom(in: com.google.protobuf.CodedInputStream, extensionRegistry: com.google.protobuf.ExtensionRegistryLite): Mention = { 97 | import com.google.protobuf.ExtensionRegistryLite.{getEmptyRegistry => _emptyRegistry} 98 | var __start: Option[Int] = `start` 99 | var __end: Option[Int] = `end` 100 | val __tokens: collection.mutable.Buffer[String] = `tokens`.toBuffer 101 | val __posTags: collection.mutable.Buffer[String] = `posTags`.toBuffer 102 | val __deps: collection.mutable.Buffer[Mention.Dependency] = `deps`.toBuffer 103 | var __entityName: Option[String] = `entityName` 104 | val __features: collection.mutable.Buffer[String] = `features`.toBuffer 105 | val __labels: collection.mutable.Buffer[String] = `labels`.toBuffer 106 | var __sentid: Option[Int] = `sentid` 107 | var __fileid: Option[String] = `fileid` 108 | 109 | def __newMerged = Mention( 110 | __start, 111 | __end, 112 | Vector(__tokens: _*), 113 | Vector(__posTags: _*), 114 | Vector(__deps: _*), 115 | __entityName, 116 | Vector(__features: _*), 117 | Vector(__labels: _*), 118 | __sentid, 119 | __fileid 120 | ) 121 | while (true) in.readTag match { 122 | case 0 => return __newMerged 123 | case 8 => __start = Some(in.readInt32()) 124 | case 16 => __end = Some(in.readInt32()) 125 | case 26 => __tokens += in.readString() 126 | case 34 => __posTags += in.readString() 127 | case 42 => __deps += readMessage[Mention.Dependency](in, Mention.Dependency.defaultInstance, _emptyRegistry) 128 | case 50 => __entityName = Some(in.readString()) 129 | case 58 => __features += in.readString() 130 | case 66 => __labels += in.readString() 131 | case 72 => __sentid = Some(in.readInt32()) 132 | case 82 
=> __fileid = Some(in.readString()) 133 | case default => if (!in.skipField(default)) return __newMerged 134 | } 135 | null 136 | } 137 | 138 | def mergeFrom(m: Mention) = { 139 | Mention( 140 | m.`start`.orElse(`start`), 141 | m.`end`.orElse(`end`), 142 | `tokens` ++ m.`tokens`, 143 | `posTags` ++ m.`posTags`, 144 | `deps` ++ m.`deps`, 145 | m.`entityName`.orElse(`entityName`), 146 | `features` ++ m.`features`, 147 | `labels` ++ m.`labels`, 148 | m.`sentid`.orElse(`sentid`), 149 | m.`fileid`.orElse(`fileid`) 150 | ) 151 | } 152 | 153 | def getDefaultInstanceForType = Mention.defaultInstance 154 | def clear = getDefaultInstanceForType 155 | def isInitialized = true 156 | def build = this 157 | def buildPartial = this 158 | def parsePartialFrom(cis: com.google.protobuf.CodedInputStream, er: com.google.protobuf.ExtensionRegistryLite) = mergeFrom(cis, er) 159 | override def getParserForType = this 160 | def newBuilderForType = getDefaultInstanceForType 161 | def toBuilder = this 162 | def toJson(indent: Int = 0): String = "ScalaBuff JSON generation not enabled. Use --generate_json_method to enable." 163 | } 164 | 165 | object Mention { 166 | @reflect.BeanProperty val defaultInstance = new Mention() 167 | 168 | def parseFrom(data: Array[Byte]): Mention = defaultInstance.mergeFrom(data) 169 | def parseFrom(data: Array[Byte], offset: Int, length: Int): Mention = defaultInstance.mergeFrom(data, offset, length) 170 | def parseFrom(byteString: com.google.protobuf.ByteString): Mention = defaultInstance.mergeFrom(byteString) 171 | def parseFrom(stream: java.io.InputStream): Mention = defaultInstance.mergeFrom(stream) 172 | def parseDelimitedFrom(stream: java.io.InputStream): Option[Mention] = defaultInstance.mergeDelimitedFromStream(stream) 173 | 174 | val START_FIELD_NUMBER = 1 175 | val END_FIELD_NUMBER = 2 176 | val TOKENS_FIELD_NUMBER = 3 177 | val POS_TAGS_FIELD_NUMBER = 4 178 | val DEPS_FIELD_NUMBER = 5 179 | val ENTITY_NAME_FIELD_NUMBER = 6 180 | val FEATURES_FIELD_NUMBER = 7 181 | val LABELS_FIELD_NUMBER = 8 182 | val SENTID_FIELD_NUMBER = 9 183 | val FILEID_FIELD_NUMBER = 10 184 | 185 | def newBuilder = defaultInstance.newBuilderForType 186 | def newBuilder(prototype: Mention) = defaultInstance.mergeFrom(prototype) 187 | 188 | final case class Dependency ( 189 | `type`: Option[String] = None, 190 | `gov`: Option[Int] = None, 191 | `dep`: Option[Int] = None 192 | ) extends com.google.protobuf.GeneratedMessageLite 193 | with com.google.protobuf.MessageLite.Builder 194 | with net.sandrogrzicic.scalabuff.Message[Dependency] 195 | with net.sandrogrzicic.scalabuff.Parser[Dependency] { 196 | 197 | def setType(_f: String) = copy(`type` = Some(_f)) 198 | def setGov(_f: Int) = copy(`gov` = Some(_f)) 199 | def setDep(_f: Int) = copy(`dep` = Some(_f)) 200 | 201 | def clearType = copy(`type` = None) 202 | def clearGov = copy(`gov` = None) 203 | def clearDep = copy(`dep` = None) 204 | 205 | def writeTo(output: com.google.protobuf.CodedOutputStream) { 206 | if (`type`.isDefined) output.writeString(1, `type`.get) 207 | if (`gov`.isDefined) output.writeInt32(2, `gov`.get) 208 | if (`dep`.isDefined) output.writeInt32(3, `dep`.get) 209 | } 210 | 211 | lazy val getSerializedSize = { 212 | import com.google.protobuf.CodedOutputStream._ 213 | var __size = 0 214 | if (`type`.isDefined) __size += computeStringSize(1, `type`.get) 215 | if (`gov`.isDefined) __size += computeInt32Size(2, `gov`.get) 216 | if (`dep`.isDefined) __size += computeInt32Size(3, `dep`.get) 217 | 218 | __size 219 | } 220 | 221 | def 
mergeFrom(in: com.google.protobuf.CodedInputStream, extensionRegistry: com.google.protobuf.ExtensionRegistryLite): Dependency = { 222 | import com.google.protobuf.ExtensionRegistryLite.{getEmptyRegistry => _emptyRegistry} 223 | var __type: Option[String] = `type` 224 | var __gov: Option[Int] = `gov` 225 | var __dep: Option[Int] = `dep` 226 | 227 | def __newMerged = Dependency( 228 | __type, 229 | __gov, 230 | __dep 231 | ) 232 | while (true) in.readTag match { 233 | case 0 => return __newMerged 234 | case 10 => __type = Some(in.readString()) 235 | case 16 => __gov = Some(in.readInt32()) 236 | case 24 => __dep = Some(in.readInt32()) 237 | case default => if (!in.skipField(default)) return __newMerged 238 | } 239 | null 240 | } 241 | 242 | def mergeFrom(m: Dependency) = { 243 | Dependency( 244 | m.`type`.orElse(`type`), 245 | m.`gov`.orElse(`gov`), 246 | m.`dep`.orElse(`dep`) 247 | ) 248 | } 249 | 250 | def getDefaultInstanceForType = Dependency.defaultInstance 251 | def clear = getDefaultInstanceForType 252 | def isInitialized = true 253 | def build = this 254 | def buildPartial = this 255 | def parsePartialFrom(cis: com.google.protobuf.CodedInputStream, er: com.google.protobuf.ExtensionRegistryLite) = mergeFrom(cis, er) 256 | override def getParserForType = this 257 | def newBuilderForType = getDefaultInstanceForType 258 | def toBuilder = this 259 | def toJson(indent: Int = 0): String = "ScalaBuff JSON generation not enabled. Use --generate_json_method to enable." 260 | } 261 | 262 | object Dependency { 263 | @reflect.BeanProperty val defaultInstance = new Dependency() 264 | 265 | def parseFrom(data: Array[Byte]): Dependency = defaultInstance.mergeFrom(data) 266 | def parseFrom(data: Array[Byte], offset: Int, length: Int): Dependency = defaultInstance.mergeFrom(data, offset, length) 267 | def parseFrom(byteString: com.google.protobuf.ByteString): Dependency = defaultInstance.mergeFrom(byteString) 268 | def parseFrom(stream: java.io.InputStream): Dependency = defaultInstance.mergeFrom(stream) 269 | def parseDelimitedFrom(stream: java.io.InputStream): Option[Dependency] = defaultInstance.mergeDelimitedFromStream(stream) 270 | 271 | val TYPE_FIELD_NUMBER = 1 272 | val GOV_FIELD_NUMBER = 2 273 | val DEP_FIELD_NUMBER = 3 274 | 275 | def newBuilder = defaultInstance.newBuilderForType 276 | def newBuilder(prototype: Dependency) = defaultInstance.mergeFrom(prototype) 277 | 278 | } 279 | } 280 | 281 | object EntityProtos { 282 | def registerAllExtensions(registry: com.google.protobuf.ExtensionRegistryLite) { 283 | } 284 | 285 | } 286 | 287 | } 288 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/io/MatrixFilter.scala: -------------------------------------------------------------------------------- 1 | package uclmr.io 2 | 3 | import java.io.FileWriter 4 | 5 | import uclmr.TensorKB 6 | import ml.wolfe.util.Conf 7 | 8 | /** 9 | * Reads in a huge sparse matrix and filters it. 10 | * Can also be used to add more data to the matrix (e.g. freebase facts). 
11 | * 12 | * args0: input matrix 13 | * args1: output matrix 14 | * args2-: additional freebase relations 15 | * @author rockt 16 | */ 17 | object MatrixFilter extends App { 18 | val filePath = args.lift(0).getOrElse("./data/bbc/matrix_multi_all.txt") 19 | println("Loading...") 20 | val kb = LoadTSV(filePath = filePath) 21 | println(kb.toInfoString) 22 | 23 | val matricesToAdd = if (args.size > 2) args.tail.tail else Array( 24 | "./data/bbc/matrix_freebase.txt" 25 | ) 26 | println("Loading additional data...") 27 | matricesToAdd.foreach(fileName => LoadTSV(db = kb, filePath = fileName)) 28 | println(kb.toInfoString) 29 | 30 | println("Filtering...") 31 | val filteredKB = new TensorKB() 32 | 33 | val frequentRows = kb.keys2.filter(key2 => kb.getBy2(key2).size > 10).toSet 34 | val frequentCols = kb.keys1.filter(key1 => kb.getBy1(key1).size > 25).toSet 35 | 36 | kb.cells 37 | .filter(c => frequentCols(c.key1) && frequentRows(c.key2)) 38 | .foreach(cell => filteredKB += cell) 39 | 40 | println(filteredKB.toInfoString) 41 | 42 | val fileWriter = new FileWriter(args.lift(1).getOrElse("./data/bbc/matrix_final.txt")) 43 | filteredKB.cells.foreach(cell => { 44 | val (e1, e2) = cell.key2 45 | fileWriter.write(s"${cell.key1}\t$e1\t$e2\t${cell.cellType}\t${cell.target}\n") 46 | }) 47 | fileWriter.close() 48 | } 49 | 50 | /** 51 | * Shows stats about the matrix, e.g., what freebase relations are in there. 52 | */ 53 | object MatrixInspector extends App { 54 | val kb = LoadTSV(filePath = args.lift(0).getOrElse("./data/bbc/matrix_final.txt")) 55 | println(kb.toInfoString) 56 | 57 | val freebaseRelations = kb.keys1.filter(_.toString.startsWith("REL$")) 58 | freebaseRelations.foreach(println) 59 | } -------------------------------------------------------------------------------- /src/main/scala/uclmr/io/TSV.scala: -------------------------------------------------------------------------------- 1 | package uclmr.io 2 | 3 | import uclmr.{DefaultIx, Cell, CellType, TensorKB} 4 | import ml.wolfe.util.{ProgressBar, Conf} 5 | 6 | import scala.io.Source 7 | import scala.util.Random 8 | 9 | /** 10 | * @author rockt 11 | */ 12 | object LoadTSV extends App { 13 | def apply(k: Int = 100, subsample: Double = 1.0, db: TensorKB = null, filePath: String = Conf.getString("inputFile")): TensorKB = { 14 | val kb = if (db != null) db else new TensorKB(k) 15 | val rand = new Random(0L) 16 | 17 | val lines = Source.fromFile(filePath).getLines() 18 | 19 | val progressBar = new ProgressBar(Source.fromFile(filePath).getLines().size, 100000) 20 | progressBar.start() 21 | 22 | for { 23 | fact <- lines 24 | Array(r, e1, e2, typ, target) = fact.split("\t") 25 | } { 26 | val cellType = typ match { 27 | case "Train" => CellType.Train 28 | case "Test" => CellType.Test 29 | case "Dev" => CellType.Dev 30 | case "Observed" => CellType.Observed 31 | } 32 | 33 | if (rand.nextDouble() < subsample) { 34 | val cell = Cell(r, (e1, e2), DefaultIx, target.toDouble, cellType) 35 | kb += cell 36 | } 37 | 38 | progressBar(r) 39 | } 40 | 41 | kb 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/ArgMaxSigmoid.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import cc.factorie.la.DenseTensor1 4 | import cc.factorie.optimize.{AdaGrad, OnlineTrainer, BatchTrainer} 5 | import ml.wolfe.fg.{L2Regularization, CellLogisticLoss, VectorMsgs} 6 | import ml.wolfe.util.ProgressLogging 7 | import
ml.wolfe.{GradientBasedOptimizer, FactorieVector} 8 | import uclmr.{Cell, TensorDB} 9 | 10 | import scala.util.Random 11 | 12 | /** 13 | * @author rockt 14 | */ 15 | object ArgMaxSigmoid extends App { 16 | /** 17 | * Probably the most expensive way to find the argmax of the sigmoid of a dot product in the history of optimization. 18 | * Anyway, finds the vector whose dot product with a given vector yields a sigmoid closest to the target value. 19 | * @param vec a given vector 20 | * @param target target value 21 | * @return vec* such that σ(vec • vec*) ≈ target 22 | */ 23 | def apply(vec: FactorieVector, target: Double = 1.0, lambda: Double = 0.01): FactorieVector = { 24 | val db = new TensorDB(vec.length) 25 | 26 | db += Cell("vec1", "vec2") 27 | 28 | val fg = db.toFactorGraph 29 | 30 | val vec1Node = db.node1("vec1").get 31 | val vec2Node = db.node2("vec2").get 32 | 33 | fg.buildFactor(Seq(vec2Node, vec1Node))(_ map (_ => new VectorMsgs)) { 34 | e => new CellLogisticLoss(e(0), e(1), target, lambda, 1.0, false) with L2Regularization 35 | } 36 | 37 | fg.build() 38 | 39 | vec1Node.variable.asVector.b = vec 40 | 41 | GradientBasedOptimizer(fg, new OnlineTrainer(_, new AdaGrad(rate = 1.0), 1000, 1) with ProgressLogging) 42 | 43 | vec2Node.variable.asVector.b 44 | } 45 | 46 | val rand = new Random(0L) 47 | 48 | val col = new DenseTensor1((0 until 100).map(i => rand.nextGaussian() * 0.1).toArray) 49 | 50 | println(col) 51 | 52 | val row = ArgMaxSigmoid(col) 53 | 54 | 55 | def sig(x: Double) = 1.0 / (1.0 + math.exp(-x)) 56 | 57 | println("vec1: " + col.mkString("\t")) 58 | println("vec2: " + row.mkString("\t")) 59 | println("σ(vec1 • vec2): " + sig(col dot row)) 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/DataInspector.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import java.io.FileWriter 4 | 5 | import uclmr.TensorDB 6 | import uclmr.io.LoadNAACL 7 | import ml.wolfe.util.Conf 8 | 9 | import scala.util.Random 10 | 11 | /** 12 | * @author rockt 13 | */ 14 | object DataInspector extends App { 15 | Conf.add(args.lift(0).getOrElse("./conf/mf.conf")) 16 | 17 | /* 18 | val db = LoadNAACL() 19 | println(db.toInfoString) 20 | 21 | println(db.trainCells.count(_.key1.toString.startsWith("REL$"))) 22 | 23 | Mentions.load() 24 | val pathToMentionsMap = Mentions.pathToMentions 25 | 26 | val paths = Seq( 27 | "path#nn|<-nn<-unit->prep->of->pobj->|pobj:INV", 28 | "path#appos|->appos->producer->dep->|dep:INV", 29 | "path#nsubj|<-nsubj<-city->prep->in->pobj->|pobj", 30 | "path#pobj|<-pobj<-to<-prep<-move->prep->to->pobj->|pobj:INV" 31 | ) 32 | 33 | paths.foreach(p => { 34 | println(p) 35 | println(pathToMentionsMap(p).mkString("\n")) 36 | println() 37 | }) 38 | */ 39 | 40 | val lengthsWriter = new FileWriter("./data/eval/lengths.txt") 41 | 42 | val formulaePredicates = { 43 | val db2 = LoadNAACL() 44 | db2.formulae.map(f => f.predicates(0) -> f.predicates(1)) 45 | } 46 | 47 | def writeLenghts(pathToDB: String, sample: Boolean, label: String) { 48 | val db = new TensorDB(100) 49 | 50 | db.deserialize(pathToDB) 51 | 52 | val rand = new Random(0L) 53 | 54 | val numSamples = 1000 55 | 56 | val pairs = 57 | if (sample) 58 | for (i <- 0 until numSamples) yield { 59 | val premise = db.keys1(rand.nextInt(db.keys1.size)) 60 | val consequent = db.keys1(rand.nextInt(db.keys1.size)) 61 | (premise, consequent) 62 | } 63 | else 64 | formulaePredicates 65 | 66 | //println(pairs.size)
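// a predicate's "length" below is the two-norm of its embedding vector; for every
// (premise, consequent) pair the output file records the difference
// length(consequent) - length(premise) together with the given label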
67 | 68 | def key1ToLength(key1: Any): Double = db.node1(key1).get.variable.asVector.b.twoNorm 69 | 70 | 71 | pairs.foreach(p => { 72 | val (premise, consequent) = p 73 | lengthsWriter.write((key1ToLength(consequent) - key1ToLength(premise)).toString + "\t" + label + "\n") 74 | }) 75 | } 76 | 77 | writeLenghts("data/out/F/serialized/", true, "mf-sample") 78 | writeLenghts("data/out/F/serialized/", false, "mf-formulae") 79 | writeLenghts("data/out/F-Pre/serialized/", false, "pre-formulae") 80 | writeLenghts("data/out/F-formulae-100/serialized/", false, "joint-formulae") 81 | 82 | lengthsWriter.close() 83 | } 84 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/FormulaeAnnotator.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import java.io.FileWriter 4 | 5 | import ml.wolfe.util.ANSIFormatter._ 6 | import ml.wolfe.util.ProgressBar 7 | 8 | import scala.collection.mutable.ArrayBuffer 9 | import scala.io.Source 10 | import scala.util.Random 11 | 12 | 13 | /** 14 | * @author rockt 15 | */ 16 | 17 | object Mentions { 18 | val pathToMentions = new collection.mutable.HashMap[String, ArrayBuffer[String]] 19 | 20 | def load(filePath: String = "./data/naacl2013/nyt-freebase.test.mentions.txt"): Unit = { 21 | println("Loading sentences for dependency paths...") 22 | 23 | val progressBar = new ProgressBar(Source.fromFile(filePath, "iso-8859-1").getLines().size, 100000) 24 | progressBar.start() 25 | 26 | val lines = Source.fromFile(filePath, "iso-8859-1").getLines() 27 | lines.foreach(line => { 28 | if (!line.isEmpty && !line.startsWith("#Document")) { 29 | val splits = line.split("\t") 30 | val label = splits(0) 31 | val arg1 = splits(1) 32 | val arg2 = splits(2) 33 | val typ = splits(3) 34 | 35 | val path = splits.find(_.startsWith("path#")).get 36 | 37 | var sentence = splits.find(_.startsWith("sen#")).get.drop(4) 38 | .replaceAllLiterally(arg1, arg1.onBlue()) 39 | .replaceAllLiterally(arg2, arg2.onRed()) 40 | 41 | pathToMentions.getOrElseUpdate(path, new ArrayBuffer[String]()) += sentence 42 | } 43 | progressBar() 44 | }) 45 | } 46 | } 47 | 48 | object Action extends Enumeration { 49 | type Answer = Value 50 | val No, Yes, Unsure, Opposite, More, Quit, Undefined = Value 51 | } 52 | 53 | object FormulaeAnnotator extends App { 54 | import uclmr.util.Action._ 55 | 56 | val filePath = args.lift(0).getOrElse("./data/formulae/1000.txt") 57 | val reannotate = args.lift(1).getOrElse("false").toBoolean 58 | val skipUntil = args.lift(2).getOrElse("0").toInt 59 | val rhsFilter: String => Boolean = if (args.size > 3) s => !s.endsWith(args(3)) else s => true 60 | 61 | println(s"Creating backup at $filePath.old...") 62 | val backup = new FileWriter(filePath + ".old") 63 | backup.write(Source.fromFile(filePath, "iso-8859-1").getLines().mkString("\n")) 64 | backup.close() 65 | 66 | val rand = new Random(0l) 67 | val fileWriter = new FileWriter(filePath + ".tmp") 68 | 69 | Mentions.load() 70 | val text = Source.fromFile(filePath, "iso-8859-1").getLines().mkString("\n") 71 | val formulae = text.split("\n\n") 72 | val newFile = new FileWriter(filePath) 73 | var quit = false 74 | 75 | formulae.foreach(formulaDatum => { 76 | val Array(statsTmp, formula) = formulaDatum.split("\n") 77 | 78 | val isCurated = statsTmp.endsWith("curated") 79 | val stats = if (isCurated) statsTmp.dropRight(8) else statsTmp 80 | val ix = stats.drop(2).split("\t")(0).toInt 81 | 82 | 83 | if (quit || (!reannotate && 
isCurated) || ix < skipUntil || !rhsFilter(formula)) {
84 | //ignore already curated formulae or the rest of the formulae in case we quit annotation
85 | fileWriter.write(formulaDatum + "\n\n")
86 | } else {
87 | //otherwise start annotation
88 | //currently only supports implications
89 | 
90 | val Array(lhsTmp, arrow, rhsTmp) = formula.split(" ")
91 | 
92 | val isCommentedOut = lhsTmp.startsWith("//")
93 | val isNegated = rhsTmp.startsWith("!")
94 | 
95 | val lhs = if (isCommentedOut) lhsTmp.drop(2) else lhsTmp
96 | val rhs = if (isNegated) rhsTmp.drop(1) else rhsTmp
97 | 
98 | val sentences = Mentions.pathToMentions.get(lhs)
99 | .map(s => rand.shuffle(s)).getOrElse(List())
100 | 
101 | println()
102 | println(stats)
103 | 
104 | var path = lhs.drop(5)
105 | 
106 | val pathWords = path
107 | .split("\\|")(1)
108 | .split("<-|->")
109 | .zipWithIndex
110 | .filter(_._2 % 2 == 0)
111 | .map(_._1)
112 | .toList
113 | .filterNot(_.isEmpty)
114 | .map(_.trim)
115 | 
116 | for (w <- pathWords)
117 | path = path.replaceAllLiterally(w, w.onYellow())
118 | 
119 | val rel = rhs.split("/").last
120 | 
121 | println(path + " => " + rhs.replaceAllLiterally(rel, rel.onMagenta()))
122 | if (reannotate && isCurated)
123 | println("currently".onCyan() + ": " +
124 | (if (isCommentedOut) "//".onRed() else "") +
125 | "A" + " => " + (if (isNegated) "!".onRed() else "") + "B"
126 | )
127 | 
128 | 
129 | var mentionIx = 0 //offset into the shuffled mention list (distinct from the formula index `ix` above)
130 | var answer: Answer = Undefined
131 | 
132 | while (answer == Undefined) {
133 | var examples = sentences.slice(mentionIx, mentionIx + 10).toList
134 | for (w <- pathWords.filter(_.size > 2)) {
135 | examples = examples.map(s => s.replaceAllLiterally(w, w.yellow()))
136 | }
137 | 
138 | examples.foreach(s => println(s"\t$s"))
139 | mentionIx += 10
140 | 
141 | print("\nAdd this rule [y/n/u], the opposite [o], more mentions [m], or stop annotating [quit]: \r")
142 | 
143 | answer = stringToAnswer(readLine())
144 | while (answer == Undefined) {
145 | print("Please answer with [y/n/o/u/m/quit]! 
\r") 146 | answer = stringToAnswer(readLine()) 147 | } 148 | 149 | answer match { 150 | case Yes => 151 | fileWriter.write(s"$stats\tcurated\n") 152 | fileWriter.write(s"$lhs => $rhs\n\n") 153 | case No => 154 | fileWriter.write(s"$stats\tcurated\n") 155 | fileWriter.write(s"//$lhs => $rhs\n\n") 156 | case Opposite => 157 | fileWriter.write(s"$stats\tcurated\n") 158 | fileWriter.write(s"$lhs => !$rhs\n\n") 159 | case Unsure => 160 | fileWriter.write(s"$stats\n") 161 | fileWriter.write(s"//$lhs => $rhs\n\n") 162 | case More => 163 | answer = Undefined 164 | case Quit => 165 | fileWriter.write(formulaDatum + "\n\n") 166 | quit = true 167 | } 168 | } 169 | } 170 | }) 171 | 172 | fileWriter.close() 173 | 174 | def stringToAnswer(string: String): Answer = string.toLowerCase.trim match { 175 | case "y" | "yes" => Yes 176 | case "n" | "no" => No 177 | case "o" | "opposite" => Opposite 178 | case "u" | "unsure" => Unsure 179 | case "m" | "more" => More 180 | case "quit" | "exit" => Quit 181 | case _ => Undefined 182 | } 183 | newFile.write(Source.fromFile(filePath + ".tmp", "iso-8859-1").getLines().mkString("\n")) 184 | newFile.close() 185 | } 186 | 187 | /* 188 | object CompareRanks extends App { 189 | import RuleFinder.loadDB 190 | 191 | val db1Path = if (args.size > 0) args(0) else "./out/vectorland-F/serialized/" 192 | val db2Path = if (args.size > 1) args(1) else "./out/latest/serialized/" //"./out/vectorland-F-rules-100/serialized/" 193 | 194 | val rulesFile = if (args.size > 2) args(2) else db2Path+"rules.txt" 195 | 196 | val implRhsToLhsMap = Source.fromFile(rulesFile, "iso-8859-1").getLines().toList.map(_.split(" => ")) 197 | .collect { case Array(lhs,rhs) if !lhs.startsWith("//") && !rhs.startsWith("!") => (lhs, rhs) } 198 | .groupBy(_._2).mapValues(l => l.map(_._1)) 199 | 200 | val implNegRhsToLhsMap = Source.fromFile(rulesFile, "iso-8859-1").getLines().toList.map(_.split(" => ")) 201 | .collect { case Array(lhs,rhs) if !lhs.startsWith("//") && rhs.startsWith("!") => (lhs, rhs.drop(1)) } 202 | .groupBy(_._2).mapValues(l => l.map(_._1)) 203 | 204 | //println("A => B") 205 | //println(implRhsToLhsMap.mkString("\n")) 206 | 207 | //println("A => !B") 208 | //println(implNegRhsToLhsMap.mkString("\n")) 209 | 210 | val db1 = loadDB(db1Path) 211 | val db2 = loadDB(db2Path) 212 | 213 | def printChanges(formulaMap: Map[String, List[String]]) = { 214 | val changeInImplRanks = toChangeInRanksAndP(formulaMap) 215 | val changeInImplFormulae = toChangeInFormulaeSimAndScore(formulaMap) 216 | for { 217 | key <- changeInImplRanks.keys 218 | (rankChange, rankImproved, pChange) = changeInImplRanks(key) 219 | (simChange, scoreChange, scoreObsChange) = changeInImplFormulae(key) 220 | } println(key + "\n\trank: %8.2f\tups: %5.2f\tp: %5.2f\tscore: %5.2f\tsim: %5.2f" 221 | .format(rankChange, rankImproved, pChange, scoreChange, simChange) + "\n") 222 | } 223 | 224 | def toChangeInRanksAndP(formulaMap: Map[String, List[String]]): Map[String, (Double, Double, Double)] = { 225 | formulaMap.toList.map(t => { 226 | val (key, values) = t 227 | 228 | val db1ImplFacts = sortByRHS(db1, key, formulaMap) 229 | val db2ImplFacts = sortByRHS(db2, key, formulaMap) 230 | 231 | val compared = compare(db1ImplFacts, db2ImplFacts).take(1000) 232 | 233 | //println(compared.mkString("\n")) 234 | 235 | val rankChange = compared.values.map(_._1).sum / compared.size.toDouble 236 | val pChange = compared.values.map(_._2).sum / compared.size 237 | val rankImproved = compared.values.map(_._1).count(_ > 0) / compared.size.toDouble 238 | 
239 | //println(s"\tavg rank change: ${if (rankChange > 0) "+" + rankChange else rankChange}")
240 | //println(s"\tavg p change: $pChange")
241 | key -> (rankChange, rankImproved, pChange)
242 | }).toMap
243 | }
244 | 
245 | def sortByRHS(db: SPDB, relation: String, rhsToLhsMap: Map[String, List[String]]): Map[Fact, (Int, Double)] =
246 | getTopKFactsPerRel(db, relation).zipWithIndex
247 | .filter { case (f, ix) => existsPremise(db1, f, rhsToLhsMap) }
248 | .map { case (f, ix) => f -> (ix, db.prob(f)) }
249 | .toMap
250 | 
251 | def getTopKFactsPerRel(db: SPDB, relation: String, k: Int = Int.MaxValue): Seq[Fact] =
252 | db.facts(db.relation(relation).get).sortBy(f => -db.prob(f)).take(k)
253 | 
254 | def existsPremise(db: SPDB, fact: Fact, rhsToLhsMap: Map[String, List[String]]): Boolean =
255 | rhsToLhsMap(fact.relation.name).exists(lhs => db.fact(fact.args, db.relation(lhs).get).isDefined)
256 | 
257 | def compare(map1: Map[Fact, (Int, Double)], map2: Map[Fact, (Int, Double)]): Map[Fact, (Int, Double)] =
258 | (map1.keySet intersect map2.keySet).map(k => {
259 | val (ix1, p1) = map1(k)
260 | val (ix2, p2) = map2(k)
261 | k -> (ix1 - ix2, p2 - p1)
262 | }).toMap
263 | 
264 | import RuleFinder.implScore
265 | import RuleFinder.implNegScore
266 | import RuleFinder.implScoreTruePremise
267 | import RuleFinder.implScoreTrain
268 | 
269 | def toChangeInFormulaeSimAndScore(formulaMap: Map[String, List[String]]): Map[String, (Double, Double, Double)] = {
270 | formulaMap.toList.map(t => {
271 | val (key, values) = t
272 | var scoreChange = 0.0
273 | var scoreObsChange = 0.0
274 | var simChange = 0.0
275 | for {
276 | lhsName <- values
277 | rhsName = key
278 | rhsDb1 = db1.relation(rhsName).get
279 | lhsDb1 = db1.relation(lhsName).get
280 | rhsDb2 = db2.relation(rhsName).get
281 | lhsDb2 = db2.relation(lhsName).get
282 | lhsEmbeddingDb1 = lhsDb1.embedding(db1)
283 | rhsEmbeddingDb1 = rhsDb1.embedding(db1)
284 | lhsEmbeddingDb2 = lhsDb2.embedding(db2)
285 | rhsEmbeddingDb2 = rhsDb2.embedding(db2)
286 | db1Sim = lhsEmbeddingDb1 cosineSimilarity rhsEmbeddingDb1
287 | db2Sim = lhsEmbeddingDb2 cosineSimilarity rhsEmbeddingDb2
288 | } {
289 | val scoreDb1 = implScore(lhsDb1, rhsDb1, db1.trainFacts.map(_.args))(db1)._1
290 | val scoreDb2 = implScore(lhsDb2, rhsDb2, db2.trainFacts.map(_.args))(db2)._1
291 | 
292 | val scoreObsDb1 = implScore(lhsDb1, rhsDb1, rowsWithTrueLhs(db1, lhsName))(db1)._1
293 | val scoreObsDb2 = implScore(lhsDb2, rhsDb2, rowsWithTrueLhs(db2, lhsName))(db2)._1
294 | 
295 | simChange += (db2Sim - db1Sim)
296 | scoreChange += (scoreDb2 - scoreDb1)
297 | scoreObsChange += (scoreObsDb2 - scoreObsDb1)
298 | }
299 | key -> (simChange / values.size, scoreChange / values.size, scoreObsChange / values.size)
300 | }).toMap
301 | }
302 | 
303 | def rowsWithTrueLhs(db: SPDB, lhs: String): Seq[List[Entity]] =
304 | db.facts(db.relation(lhs).get).filter(_.train).map(_.args)
305 | 
306 | println("A => B")
307 | printChanges(implRhsToLhsMap)
308 | println("\nA => !B")
309 | printChanges(implNegRhsToLhsMap)
310 | }
311 | */
312 | 
313 | object FormulaeFilter extends App {
314 | val filePath = args.lift(0).getOrElse("data/formulae/10000.txt")
315 | val outputPath = args.lift(1).getOrElse("data/formulae/10000-filtered.txt")
316 | val fileWriter = new FileWriter(outputPath)
317 | 
318 | val rules = Source.fromFile(filePath, "iso-8859-1").getLines().mkString("\n").split("\n\n")
319 | for {
320 | rule <- rules
321 | Array(stats, formula) = rule.split("\n")
322 | tmp = stats.drop(2).split("\t")
323 | Array(rank, 
mfHint, dataHint, numPremises) = tmp 324 | if !formula.startsWith("//") 325 | Array(premise, consequent) = formula.split(" => ") 326 | } { 327 | if (false && dataHint.toDouble > 0.75) 328 | fileWriter.write(rule + "\n\n") 329 | else if (dataHint.toDouble <= 0.01 && mfHint.toDouble >= 0.8) 330 | fileWriter.write(stats + "\n" + premise + " => !" + consequent + "\n\n") 331 | } 332 | 333 | fileWriter.close() 334 | } 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/FormulaeExtractor.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import java.io.FileWriter 4 | 5 | import uclmr.{Formula, Impl, ImplNeg, TensorKB} 6 | import ml.wolfe.util.ProgressBar 7 | 8 | 9 | /** 10 | * @author rockt 11 | */ 12 | object FormulaeExtractor extends App { 13 | type Rule = Formula 14 | type Entity = Any 15 | type Relation = Any 16 | type SPDB = TensorKB 17 | 18 | def formulaScore(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true)(implicit db: SPDB): (Double, Int) = { 19 | val rows = pairs.filter(argFilter).map(_.head) 20 | (rows.map(e => rule(e)).sum / rows.size, rows.size) 21 | } 22 | 23 | /** 24 | * Calculates the weight of the formula based on matrix factorization predictions on observed premises. 25 | */ 26 | def formulaScoreMF(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true)(implicit db: SPDB): (Double, Int) = { 27 | val p1 = rule.predicates(0) 28 | val rows = db.getBy1(p1).map { case (ei, ej) => List(ei) } 29 | 30 | //we only care about the score over true observed premises 31 | val filteredRows = rows.filter(argFilter).map(_.head).filter(e => { 32 | val cell = db.get(p1,e).get 33 | cell.train && cell.target == 1.0 34 | }) 35 | 36 | (filteredRows.map(e => rule(e)).sum / filteredRows.size, filteredRows.size) 37 | } 38 | 39 | /** 40 | * Calculates the weight of the formula based on matrix factorization predictions. 41 | */ 42 | def formulaScoreMFPredicted(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true, threshold: Double = 0.1, onlyUnobserved: Boolean = true)(implicit db: SPDB): (Double, Int) = { 43 | val p1 = rule.predicates(0) 44 | val rows = db.getBy1(p1).map { case (ei, ej) => List(ei) } 45 | 46 | //we only care about the score over true predicted premises 47 | val filteredRows = rows.filter(argFilter).map(_.head).filter(e => db.prob(p1,e) >= threshold && (!onlyUnobserved || !db.get(p1, e).get.train)) 48 | 49 | if (filteredRows.isEmpty) (1.0, 0) else (filteredRows.map(e => rule(e)).sum / filteredRows.size, filteredRows.size) 50 | } 51 | 52 | /** 53 | * Calculates the weight of the formula based on the training data. 
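 * A hedged toy example (cells invented): with training cells where A(e1)=1 and B(e1)=1,
 * and A(e2)=1 but B(e2) is missing, Impl(A, B) is satisfied on e1 and violated on e2,
 * so formulaScoreData would return (0.5, 2): half of the true-premise rows obey the rule.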
54 | */
55 | def formulaScoreData(rule: Rule, pairs: Seq[List[Entity]], argFilter: List[Entity] => Boolean = e => true)(implicit db: SPDB): (Double, Int) = {
56 | val p1 = rule.predicates(0)
57 | val rows = db.getBy1(p1).map { case (ei, ej) => List(ei) }
58 | 
59 | //we only care about the score over true observed premises
60 | val filteredRows = rows.filter(argFilter).map(_.head).filter(e => {
61 | val cell = db.get(p1,e).get
62 | cell.train && cell.target == 1.0
63 | })
64 | 
65 | (filteredRows.map(e => rule match {
66 | case Impl(_, p2, _) => if (db.get(p1, e).get.target == 1.0 && db.get(p2, e).map(_.target).getOrElse(0.0) == 0.0) 0.0 else 1.0
67 | case ImplNeg(_, p2, _) => if (db.get(p1, e).get.target == 1.0 && db.get(p2, e).map(_.target).getOrElse(0.0) == 1.0) 0.0 else 1.0
68 | }).sum / filteredRows.size, filteredRows.size)
69 | }
70 | 
71 | def implScore(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
72 | formulaScore(Impl(r1,r2), pairs)
73 | 
74 | def implNegScore(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
75 | formulaScore(ImplNeg(r1,r2), pairs)
76 | 
77 | def implScoreMF(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
78 | formulaScoreMF(Impl(r1,r2), pairs)
79 | 
80 | def implNegScoreMF(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
81 | formulaScoreMF(ImplNeg(r1,r2), pairs)
82 | 
83 | def implScoreMFPredicted(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
84 | formulaScoreMFPredicted(Impl(r1,r2), pairs)
85 | 
86 | def implNegScoreMFPredicted(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
87 | formulaScoreMFPredicted(ImplNeg(r1,r2), pairs)
88 | 
89 | def implScoreData(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
90 | formulaScoreData(Impl(r1,r2), pairs)
91 | 
92 | def implNegScoreData(r1: Relation, r2: Relation, pairs: Seq[List[Entity]])(implicit db: SPDB): (Double, Int) =
93 | formulaScoreData(ImplNeg(r1,r2), pairs)
94 | 
95 | 
96 | 
97 | lazy val testRelations = Seq(
98 | "person/company",
99 | "location/containedby",
100 | "person/nationality",
101 | "author/works_written",
102 | "parent/child",
103 | "person/place_of_birth",
104 | "person/place_of_death",
105 | "neighborhood/neighborhood_of",
106 | "person/parents",
107 | "company/founders",
108 | "sports_team/league",
109 | "team_owner/teams_owned",
110 | "team/arena_stadium",
111 | "film/directed_by",
112 | "broadcast/area_served",
113 | "structure/architect",
114 | "composer/compositions",
115 | "person/religion",
116 | "film/produced_by"
117 | ).toSet
118 | 
119 | def consequentFilter(r: Relation) = testRelations.exists(s => r.asInstanceOf[String].contains(s))
120 | 
121 | 
122 | implicit val db = new SPDB
123 | 
124 | println("Loading db...")
125 | db.deserialize(args.lift(0).getOrElse("wolfe-apps/data/out/F/serialized/"))
126 | println(db.toInfoString)
127 | 
128 | val premises = db.relations
129 | //val consequents = db.relations.filter(consequentFilter)
130 | val consequents = db.relations
131 | 
132 | val rows = db.trainCells.map(_.key2).distinct.map(List(_))
133 | //.map { case (ei, ej) => List(ei, ej) }
134 | 
135 | println("Generating formulae...")
136 | val progressBar = new ProgressBar(consequents.size * premises.size, 1000)
137 | progressBar.start()
138 | 
139 | val potentialRules = for {
140 | consequent <- consequents
141 | premise <- premises
142 | 
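//skip reflexive pairs; every remaining (premise, consequent) combination is scored below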
if premise != consequent 143 | } yield { 144 | val (scoreMF, numPremisesMF) = implScoreMF(premise, consequent, rows) 145 | val (scoreData, _) = implScoreData(premise, consequent, rows) 146 | progressBar.apply(consequent.toString) 147 | (scoreMF, scoreData, numPremisesMF, premise, consequent) 148 | } 149 | 150 | println() 151 | println("Writing formulae...") 152 | val ruleWriter = new FileWriter("wolfe-apps/data/formulae/latest.txt") 153 | potentialRules 154 | //.filter(_._2 >= 0.9) 155 | .filter(_._3 >= 10) 156 | .sortBy(-_._1) 157 | //.sortBy(-_._2) 158 | .take(100000) 159 | .zipWithIndex 160 | .foreach(z => { 161 | val (t, ix) = z 162 | ruleWriter.write("//%d\t%.2f\t%.2f\t%d\n".format(ix + 1, t._1, t._2, t._3)) 163 | ruleWriter.write(s"${t._4} => ${t._5}\n\n") 164 | }) 165 | ruleWriter.close() 166 | } -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/OptimiseMatrixFactorizationHyperParameters.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import ml.wolfe.util._ 4 | import uclmr.MatrixFactorization 5 | 6 | 7 | /** 8 | * Created by Ingolf on 06/11/2014. 9 | */ 10 | object OptimiseMatrixFactorizationHyperParameters extends App { 11 | val mfp = new MatrixFactorisationProblem() 12 | 13 | val myOptimizer: HyperParameterOptimisationAlgorithm = new NelderMeadSimplex() 14 | myOptimizer.optimise(mfp, Map[String, Double]("mf.lambda" -> 0.01, "mf.alpha" -> 0.1)) 15 | 16 | println("Best wMAP: " + myOptimizer.bestScore) 17 | println("Best parameters:\n" + myOptimizer.bestParameters) 18 | } 19 | 20 | class MatrixFactorisationProblem extends OptimisationProblem { 21 | override val parametersToOptimize: Seq[HyperParameter] = Seq(HyperParameter("mf.lambda"), HyperParameter("mf.alpha")) 22 | val startingValues = Map[String, Double]("mf.lambda" -> 0.01, "mf.alpha" -> 0.1) 23 | 24 | /** 25 | * Evaluate the optimisation problem given the set of hyper parameters. 
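 * Note: the implementation below returns the negated wMAP from MatrixFactorization.run();
 * this presumes the Nelder-Mead search minimises its objective, so maximising wMAP
 * amounts to minimising its negation (an assumption, not verified here).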
26 | * @param hyperparameters The map of hyper parameters 27 | * @return The score of this evaluation, higher is better 28 | */ 29 | override def evaluate(hyperparameters: Map[String, Double]): Double = { 30 | val confPath = "conf/mf-debug.conf" 31 | val newConfPath = "conf/mf-hyper.conf" 32 | 33 | OverrideConfig(hyperparameters, newConfPath, confPath) 34 | 35 | val mf = new MatrixFactorization(newConfPath) 36 | 37 | -mf.run() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/Predictor.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import java.io.FileWriter 4 | 5 | import uclmr.hack.EntityHackNormalization 6 | import uclmr.{DefaultIx, TensorKB} 7 | 8 | /** 9 | * @author rockt 10 | */ 11 | object Predictor extends App { 12 | val pathToMatrix = args.lift(0).getOrElse("./data/out/bbc/serialized/") 13 | val outFile = args.lift(1).getOrElse("./data/out/bbc/predictions.txt") 14 | val relations = if (args.size > 2) args.tail else Array( 15 | "REL$/location/administrative_division/country", 16 | "REL$/base/biblioness/bibs_location/country", 17 | "REL$/location/location/contains", 18 | "REL$/people/person/nationality", 19 | "REL$/base/aareas/schema/administrative_area/administrative_parent", 20 | "REL$/location/country/first_level_divisions", 21 | "REL$/location/country/capital" 22 | ) 23 | 24 | println("Loading db...") 25 | val kb = new TensorKB(100) 26 | kb.deserialize(pathToMatrix) 27 | 28 | println(kb.toInfoString) 29 | 30 | println("Predicting facts...") 31 | val predictions = relations.map(rel => rel -> kb.keys2 32 | .filterNot(t => kb.getFact(rel, t, DefaultIx).exists(_.train)) 33 | .map(t => { 34 | (kb.prob(rel, t), t) 35 | }).sortBy(-_._1) 36 | ).toMap 37 | 38 | println("Reporting predictions...") 39 | 40 | if (true || args.size > 1) { 41 | 42 | val writer = new FileWriter(outFile) 43 | 44 | EntityHackNormalization.init() 45 | 46 | predictions.foreach(t => t._2.take(100).foreach { case (score, es) => 47 | val Array(e1, e2) = es.toString.tail.init.split(",") 48 | val can1 = if (e1.startsWith("/m/")) EntityHackNormalization.getCanonical(e1) else e1 49 | val can2 = if (e2.startsWith("/m/")) EntityHackNormalization.getCanonical(e2) else e2 50 | 51 | writer.write(s"$score\t$e1\t$can1\t$e2\t$can2\t${ t._1 }\n") 52 | }) 53 | writer.close() 54 | } else { 55 | predictions.foreach(t => t._2.take(100).foreach { case (score, es) => 56 | val Array(e1, e2) = es.toString.tail.init.split(",") 57 | println(s"$score\t$e1\t$e2\t${ t._1 }") 58 | }) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/uclmr/util/SubsampleExperiments.scala: -------------------------------------------------------------------------------- 1 | package uclmr.util 2 | 3 | import java.io.File 4 | 5 | import ml.wolfe.util.{Conf, OverrideConfig, ProgressBar, RunExperimentSeries} 6 | 7 | import scala.util.Random 8 | 9 | /** 10 | * @author rockt 11 | */ 12 | object SubsampleExperiments extends App { 13 | val threads = args.lift(0).getOrElse("1").toInt 14 | val formulaeFile = args.lift(1).getOrElse("data/formulae/filtered.txt") 15 | val confPath = args.lift(2).getOrElse("conf/mf.conf") 16 | val logFilePath = args.lift(3).getOrElse("data/out/experiments.log") 17 | val runLogFilePath = args.lift(4).getOrElse("data/out/run.log") 18 | val runLogFile = new File(runLogFilePath) 19 | val runLogFileDir = new 
File(runLogFilePath.split("/").init.mkString("/"))
20 | runLogFileDir.mkdirs()
21 | runLogFile.createNewFile()
22 | 
23 | 
24 | Conf.add(OverrideConfig(Map("logFile" -> logFilePath), confPath + ".tmp", confPath))
25 | 
26 | val rand = new Random(0l)
27 | 
28 | val series = Map(
29 | "mf.subsample" -> (0 to 20).map(_ / 40.0).toSeq,
30 | "mf.mode" -> Seq("mf", "low-rank-logic", "pre-inference", "post-inference", "inference-only"),
31 | "evalConf" -> Seq("eval-subsample.conf")
32 | ).map { case (key, values) => key -> rand.shuffle(values) } //shuffle once; mapValues returns a lazy view that would re-shuffle on every access
33 | 
34 | 
35 | import scala.sys.process._
36 | val userDir = System.getProperty("user.dir")
37 | 
38 | //first compile project for all workers so that there will be no clashes
39 | Process(Seq("sbt", "compile"), new File(userDir)).!
40 | 
41 | 
42 | val progressBar = new ProgressBar(series.values.map(_.size).product, 1)
43 | progressBar.start()
44 | 
45 | RunExperimentSeries(series, threads, confPath) { conf =>
46 | (Process(Seq(
47 | "sbt",
48 | "vmargs -Xmx4G",
49 | s"run-main uclmr.MatrixFactorization $conf"), new File(userDir)
50 | ) #>> runLogFile).!!
51 | 
52 | progressBar(conf)
53 | }
54 | 
55 | System.exit(0)
56 | }
57 | 
58 | 
59 | 
60 | 
-------------------------------------------------------------------------------- /src/main/scala/uclmr/util/VectorInspector.scala: --------------------------------------------------------------------------------
1 | package uclmr.util
2 | 
3 | import cc.factorie.la._
4 | import ml.wolfe.FactorieVector
5 | import uclmr.io.LoadNAACL
6 | import uclmr.{TensorKB, Impl, TensorDB}
7 | import ml.wolfe.util.Conf
8 | import ml.wolfe.util.RichCollections._
9 | 
10 | import scala.collection.IterableLike
11 | import scala.collection.generic.CanBuildFrom
12 | 
13 | /**
14 | * @author rockt
15 | */
16 | object VectorInspector extends App {
17 | def calculateLengthsAndAngle(v1: FactorieVector, v2: FactorieVector): (Double, Double, Double) = {
18 | val length1 = math.sqrt(v1 dot v1)
19 | val length2 = math.sqrt(v2 dot v2)
20 | 
21 | val angle = math.acos((v1 dot v2) / (length1 * length2)) * (180 / math.Pi)
22 | 
23 | (length1, length2, angle)
24 | }
25 | 
26 | val pathToDB = args.lift(0).getOrElse("./data/out/F-Joint")
27 | 
28 | Conf.add(args.lift(1).getOrElse("./conf/mf.conf"))
29 | 
30 | implicit val db = new TensorKB(100)
31 | println("Deserializing DB...")
32 | db.deserialize(pathToDB + "/serialized/")
33 | 
34 | val db2 = LoadNAACL()
35 | 
36 | val formulae = db2.formulae
37 | //val formulae = Seq(Impl("path#appos|->appos->capital->prep->of->pobj->|pobj", "REL$/location/location/containedby"))
38 | 
39 | 
40 | def analyzeLengthsAndAngles() = {
41 | println("Analyzing vectors...")
42 | val pathToAnnotatedFormulae = args.lift(1).getOrElse("./data/formulae/1000.txt")
43 | val pathToAllFormulae = args.lift(2).getOrElse("./data/formulae/10000.txt")
44 | 
45 | def printStats(premise: String, consequent: String): (Double, Double, Double) = {
46 | val premiseVector = db.vector1(premise).get
47 | val consequentVector = db.vector1(consequent).get
48 | 
49 | val (premiseLength, consequentLength, angle) = calculateLengthsAndAngle(premiseVector, consequentVector)
50 | 
51 | val correctLength = if (premiseLength < consequentLength) "true" else "false"
52 | 
53 | //println("%4.2f°".format(angle) + s"\t$premiseLength\t$consequentLength\t$correctLength")
54 | println("%4.2f°".format(angle) + s"\t$premiseLength\t$consequentLength")
55 | 
56 | (premiseLength, consequentLength, angle)
57 | }
58 | 
59 | val debug = false
60 | val dropFormulae = 0
61 | val numSamples = 10
62 | 
63 | val pairsOfRelations =
64 
| if (debug) Seq("path#poss|<-poss<-executive->appos->|appos:INV" -> "REL$/business/person/company")
65 | else {
66 | val annotatedFormulae = io.Source.fromFile(pathToAnnotatedFormulae).getLines().toList.drop(dropFormulae * 3)
67 | .filterNot(l => l.startsWith("//") || l.isEmpty)
68 | .map(_.split(" => ")).map(a => (a(0), a(1))).filterNot(_._2.startsWith("!"))
69 | 
70 | val allFormulae = io.Source.fromFile(pathToAllFormulae).getLines().toList
71 | .filterNot(l => l.startsWith("//") || l.isEmpty)
72 | .map(_.split(" => ")).map(a => (a(0), a(1))).filterNot(_._2.startsWith("!"))
73 | 
74 | annotatedFormulae.distinctBy(_._2).take(numSamples) ++ allFormulae.take(10000).distinctBy(_._2).takeRight(numSamples)
75 | }
76 | 
77 | println(pairsOfRelations.take(numSamples).mkString("\n"))
78 | println()
79 | 
80 | println(s"Top $numSamples implications")
81 | val top10Stats = pairsOfRelations.take(numSamples).map(t => printStats(t._1, t._2))
82 | val top10AvgLengthDiff = top10Stats.map(t => t._2 - t._1).sum / numSamples.toDouble
83 | val top10AvgAngle = top10Stats.map(_._3).sum / numSamples.toDouble
84 | println("Average length difference: " + top10AvgLengthDiff)
85 | println("Average angle: " + top10AvgAngle)
86 | println()
87 | println(s"Bottom $numSamples implications")
88 | val least10Stats = pairsOfRelations.takeRight(numSamples).map(t => printStats(t._1, t._2))
89 | val least10AvgLengthDiff = least10Stats.map(t => t._2 - t._1).sum / numSamples.toDouble
90 | val least10AvgAngle = least10Stats.map(_._3).sum / numSamples.toDouble
91 | println("Average length difference: " + least10AvgLengthDiff)
92 | println("Average angle: " + least10AvgAngle)
93 | }
94 | 
95 | val entityPairs = db.trainCells.map(_.key2).distinct.map(List(_))
96 | 
97 | def analyzeAsymmetry() = {
98 | val tmp = formulae.map {
99 | case Impl(p1, p2, _) =>
100 | val p1Vector = db.vector1(p1).get
101 | val p2Vector = db.vector1(p2).get
102 | val (p1Length, p2Length, angle) = VectorInspector.calculateLengthsAndAngle(p1Vector, p2Vector)
103 | val lengthDiff = p2Length - p1Length
104 | //val (mfScore, numPremises) = FormulaeExtractor.formulaScoreMF(Impl(p1, p2), entityPairs)
105 | //val (mfScoreInv, numPremisesInv) = FormulaeExtractor.formulaScoreMF(Impl(p2, p1), entityPairs)
106 | val (mfScore, numPremises) = FormulaeExtractor.formulaScoreMFPredicted(Impl(p1, p2), entityPairs)
107 | val (mfScoreInv, numPremisesInv) = FormulaeExtractor.formulaScoreMFPredicted(Impl(p2, p1), entityPairs)
108 | 
109 | //println(p1 + "\t" + p1Vector.toArray.mkString("\t"))
110 | //println(p2 + "\t" + p2Vector.toArray.mkString("\t"))
111 | 
112 | val maxPremise = ArgMaxSigmoid(p1Vector)
113 | val maxConsequent = ArgMaxSigmoid(p2Vector)
114 | 
115 | /*
116 | println("premise(p-premise):\t" + ArgMaxSigmoid.sig(maxPremise dot p1Vector))
117 | println("consequent(p-premise):\t" + ArgMaxSigmoid.sig(maxPremise dot p2Vector))
118 | println("premise(p-consequent):\t" + ArgMaxSigmoid.sig(maxConsequent dot p1Vector))
119 | println("consequent(p-consequent):\t" + ArgMaxSigmoid.sig(maxConsequent dot p2Vector))
120 | */
121 | 
122 | //println(f"%%4.2f [%%d]\t%%4.2f [%%d]\t%%6.4f\t%%6.4f\t%%6.4f\t%%4.2f°\t$p1\t$p2".format(
123 | println(f"%%4.2f [%%d]\t%%4.2f [%%d]\t%%6.4f\t%%6.4f\t%%6.4f\t%%4.2f°\t${ArgMaxSigmoid.sig(maxPremise dot p2Vector)}\t${ArgMaxSigmoid.sig(maxConsequent dot p1Vector)}\t$p1\t$p2".format(
124 | mfScore, numPremises, mfScoreInv, numPremisesInv, p1Length, p2Length, lengthDiff, angle
125 | ))
126 | 
127 | (mfScore, mfScoreInv, angle, lengthDiff)
128 | }
129 | 
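//Reading the aggregates below (a hedged expectation, not something verified here):
//for a well-modelled implication A => B one would hope mfScore > mfScoreInv, i.e.
//B tends to hold where A is predicted but not the reverse, and lengthDiff > 0, i.e.
//the consequent vector is longer than the premise vector. A compact summary of that
//asymmetry, computed from the tuples gathered above:
val asymmetryShare = tmp.count(t => t._1 > t._2) / tmp.length.toDouble
println("Share with mfScore > mfScoreInv:\t" + asymmetryShare)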
130 | 131 | println("Avg A=>B score:\t" + (tmp.map(_._1).sum / tmp.length.toDouble)) 132 | println("Avg B=>A score:\t" + (tmp.map(_._2).sum / tmp.length.toDouble)) 133 | println("Avg angle:\t" + (tmp.map(_._3).sum / tmp.length.toDouble)) 134 | println("Avg length diff:\t" + (tmp.map(_._4).sum / tmp.length.toDouble)) 135 | } 136 | 137 | //analyzeLengthsAndAngles() 138 | analyzeAsymmetry() 139 | } 140 | 141 | object VectorInspectorSpec extends App { 142 | println(VectorInspector.calculateLengthsAndAngle(new DenseTensor1(Array(3.0, 0.0)), new DenseTensor1(Array(5.0, 5.0)))._3) 143 | println(VectorInspector.calculateLengthsAndAngle(new DenseTensor1(Array(3.0, 4.0)), new DenseTensor1(Array(-8.0, 6.0)))._3) 144 | println(VectorInspector.calculateLengthsAndAngle(new DenseTensor1(Array(5.0, 6.0)), new DenseTensor1(Array(-1.0, 4.0)))._3) 145 | println(VectorInspector.calculateLengthsAndAngle(new DenseTensor1(Array(3.0, 5.0)), new DenseTensor1(Array(-1.0, 6.0)))._3) 146 | } -------------------------------------------------------------------------------- /src/test/scala/uclmr/PotentialsSpec.scala: -------------------------------------------------------------------------------- 1 | package uclmr 2 | 3 | import ml.wolfe.fg.{BPRPotential, CellLogisticLoss, L2Regularization, VectorMsgs} 4 | import ml.wolfe.util.PotentialDebugger 5 | import ml.wolfe.{DenseVector, FactorGraph} 6 | 7 | import scala.util.Random 8 | 9 | /** 10 | * @author rockt 11 | */ 12 | object PotentialsSpec extends App { 13 | //building factor graph 14 | val fg = new FactorGraph() 15 | val n1 = fg.addVectorNode(100, "c") 16 | val n2 = fg.addVectorNode(100, "p1") 17 | val n3 = fg.addVectorNode(100, "p2") 18 | val n4 = fg.addVectorNode(100, "c2") 19 | 20 | val lambda = 0.01 21 | 22 | fg.buildFactor(Seq(n1, n2))(_ map (_ => new VectorMsgs)) { 23 | e => new CellLogisticLoss(e(0), e(1), 1.0, lambda) with L2Regularization 24 | } 25 | 26 | fg.buildFactor(Seq(n1, n3))(_ map (_ => new VectorMsgs)) { 27 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda) with L2Regularization 28 | } 29 | 30 | fg.buildFactor(Seq(n1, n3))(_ map (_ => new VectorMsgs)) { 31 | e => new CellLogisticLoss(e(0), e(1), 0.0, lambda, 0.5) with L2Regularization 32 | } 33 | 34 | fg.buildFactor(Seq(n1, n2, n3))(_ map (_ => new VectorMsgs)) { 35 | e => new ImplPotential(e(0), e(1), e(2), 1.0, lambda) with L2Regularization 36 | } 37 | 38 | fg.buildFactor(Seq(n1, n2, n3))(_ map (_ => new VectorMsgs)) { 39 | e => new ImplPotential(e(0), e(1), e(2), 1.0, lambda, 10.0) with L2Regularization 40 | } 41 | 42 | fg.buildFactor(Seq(n1, n3, n2))(_ map (_ => new VectorMsgs)) { 43 | e => new ImplNegPotential(e(0), e(1), e(2), 1.0, lambda) with L2Regularization 44 | } 45 | 46 | fg.buildFactor(Seq(n1, n3, n2))(_ map (_ => new VectorMsgs)) { 47 | e => new ImplNegPotential(e(0), e(1), e(2), 1.0, lambda, 0.5) with L2Regularization 48 | } 49 | 50 | 51 | fg.buildFactor(Seq(n1, n4, n2))(_ map (_ => new VectorMsgs)) { 52 | e => new BPRPotential(e(0), e(1), e(2), 1.0, lambda) with L2Regularization 53 | } 54 | 55 | fg.build() 56 | 57 | //initializing weights and messages 58 | val rand = new Random(0l) 59 | def nextInit() = rand.nextGaussian() * 0.1 60 | Seq(n1, n2, n3, n4).foreach(n => { 61 | val vec = new DenseVector((0 until 100).map(i => nextInit()).toArray) 62 | n.variable.asVector.b = vec 63 | n.variable.asVector.setting = vec 64 | }) 65 | fg.factors.foreach(_.edges.foreach(e => e.msgs.asVector.n2f = e.n.variable.asVector.b)) 66 | 67 | //gradient checking 68 | 
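//A hedged sketch of the numerical check such a debugger typically performs; the
//actual epsilon and comparison tolerance inside PotentialDebugger are assumptions here.
//For each parameter x it compares the analytic gradient against a central difference:
def centralDifference(f: Double => Double, x: Double, eps: Double = 1e-5): Double =
  (f(x + eps) - f(x - eps)) / (2 * eps)
//A potential passes when |analytic - numerical| stays below a small tolerance.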
fg.factors.map(_.potential).foreach(PotentialDebugger.checkGradients(_, debug = true))
69 | }
70 | 
-------------------------------------------------------------------------------- /src/test/scala/uclmr/TensorDBSpec.scala: --------------------------------------------------------------------------------
1 | package uclmr
2 | 
3 | import org.scalatest.{Matchers, WordSpec}
4 | 
5 | /**
6 | * Created by rockt on 19/09/2014.
7 | */
8 | class TensorDBSpec extends WordSpec with Matchers {
9 | "A tensor DB" should {
10 | "add and retrieve cells" in {
11 | val db = new TensorDB()
12 | //vector
13 | db += Cell(Seq("a"))
14 | db.get(Seq("a")) shouldBe Some(Cell(Seq("a")))
15 | 
16 | db += Cell(Seq("a", "b"))
17 | db.get(Seq("a", "b")) shouldBe Some(Cell(Seq("a", "b")))
18 | 
19 | //matrix
20 | db += Cell(1, 2)
21 | db.get(1, 2) shouldBe Some(Cell(1, 2))
22 | 
23 | //tensor
24 | db += Cell(1, 2, 3)
25 | db.get(1,2,3) shouldBe Some(Cell(1,2,3))
26 | 
27 | }
28 | 
29 | "be a matrix if cells are indexed by exactly two indices" in {
30 | val db = new TensorDB()
31 | db.isMatrix shouldBe false
32 | 
33 | db += Cell("r")
34 | db.isMatrix shouldBe false
35 | 
36 | db += Cell("r1", "e1")
37 | db.isMatrix shouldBe true
38 | 
39 | db += Cell("r2", "e1", "e2")
40 | db.isMatrix shouldBe false
41 | }
42 | 
43 | "be usable in a natural way for a knowledge base with binary relations" in {
44 | val matrix = new TensorDB()
45 | matrix.sampleTensor(10,5)
46 | println(matrix.toVerboseString(showTrain = true))
47 | 
48 | val tensor = new TensorDB()
49 | tensor.sampleTensor(10,5,5)
50 | 
51 | println(tensor.toVerboseString(showTrain = true))
52 | }
53 | 
54 | "be serializable and deserializable" in {
55 | val db = new TensorDB(5)
56 | db.sampleTensor(10, 10, 0, 0.1) //samples a matrix
57 | db += Impl("r3", "r4")
58 | db += Impl("r4", "r6")
59 | db += Impl("r6", "r2")
60 | 
61 | val fg = db.toFactorGraph
62 | fg.build()
63 | 
64 | db.serialize("/tmp/serialized/")
65 | 
66 | 
67 | val db2 = new TensorDB(5)
68 | db2.deserialize("/tmp/serialized/")
69 | 
70 | db.cells.size shouldBe db2.cells.size
71 | db.keys1.size shouldBe db2.keys1.size
72 | db.keys2.size shouldBe db2.keys2.size
73 | db.keys3.size shouldBe db2.keys3.size
74 | }
75 | }
76 | }
77 | 
78 | 
--------------------------------------------------------------------------------