├── src
│   ├── eval
│   │   ├── __init__.py
│   │   ├── evalF1.py
│   │   ├── evalDendPurity.py
│   │   ├── evalMUCF1.py
│   │   ├── finalEval.py
│   │   └── threshold.py
│   ├── trainer
│   │   ├── __init__.py
│   │   ├── BaseTrainer.py
│   │   ├── train_vect_data.py
│   │   ├── train_pair_feat.py
│   │   └── scipy_perceptron.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── mds.py
│   │   ├── fixNPCorefDataFormat.py
│   │   ├── process_aminer_stats.py
│   │   ├── processADANA.py
│   │   ├── createNPDataset.py
│   │   ├── projectFaces.py
│   │   ├── processRexa.py
│   │   ├── Config.py
│   │   ├── create_synth_dataset.py
│   │   └── combineResults.py
│   ├── hier_clust
│   │   ├── __init__.py
│   │   ├── random_split.py
│   │   └── recursive_sparsest_cut.py
│   └── models
│       ├── __init__.py
│       ├── templateClassifier.py
│       ├── linearClassifier.py
│       └── mahalabonis.py
├── bin
│   ├── revSyncResults.sh
│   ├── run.sh
│   ├── syncResults.sh
│   ├── runDiffSeed.sh
│   ├── setup.sh
│   └── compileResults.sh
├── NOTICE.txt
├── config
│   ├── synth
│   │   └── spiral.json
│   ├── rexa
│   │   ├── linkage_0.json
│   │   ├── linkage_min.json
│   │   ├── triplet.json
│   │   ├── linkage_auto.json
│   │   ├── linkage_max.json
│   │   ├── allWithin_allAcross.json
│   │   ├── bestWithin_bestAcross.json
│   │   └── mstWithin_bestAcross.json
│   ├── NP_Coref
│   │   ├── triplet.json
│   │   ├── linkage_0.json
│   │   ├── linkage_auto.json
│   │   ├── linkage_max.json
│   │   ├── linkage_min.json
│   │   ├── allWithin_allAcross.json
│   │   ├── mstWithin_bestAcross.json
│   │   └── bestWithin_bestAcross.json
│   ├── authorCoref
│   │   ├── linkage_0.json
│   │   ├── triplet.json
│   │   ├── linkage_auto.json
│   │   ├── linkage_max.json
│   │   ├── linkage_min.json
│   │   ├── allWithin_allAcross.json
│   │   ├── bestWithin_bestAcross.json
│   │   └── mstWithin_bestAcross.json
│   └── faceData_20.tsv
│       ├── triplet.json
│       ├── linkage_auto.json
│       ├── linkage_max.json
│       ├── linkage_min.json
│       ├── linkage_0.json
│       ├── allWithin_allAcross.json
│       ├── mstWithin_minAcross.json
│       └── minWithin_minAcross.json
├── .gitignore
├── resources
│   └── line_styles.json
├── README.md
├── LICENSE
└── env.yml
/src/eval/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/trainer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/hier_clust/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bin/revSyncResults.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | dir=$1
3 | while true
4 | do
5 | rsync ../$dir/ -avzi blake:/iesl/canvas/nishantyadav/clustering/$dir/
6 | sleep 60
7 | done
8 |
--------------------------------------------------------------------------------
/bin/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -xu
4 |
5 | allCommand=
6 | while [ "$#" -gt 0 ];
7 | do
8 | allCommand=" $allCommand $1 "
9 | shift
10 | done
11 |
12 | $allCommand
13 |
--------------------------------------------------------------------------------
/bin/syncResults.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -xu
3 |
4 | dir=$1
5 | time=$2
6 | while true
7 | do
8 | rsync -avzi blake:/iesl/canvas/nishantyadav/clustering/$dir/ ../$dir/
9 | sleep $time
10 | done
11 |
--------------------------------------------------------------------------------
/bin/runDiffSeed.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | set -xu
4 | startSeed=$1
5 | shift
6 | endSeed=$1
7 | shift
8 | command=$1
9 |
10 | for seed in $(seq $startSeed $endSeed)
11 | do
12 | echo $seed
13 | $command --seed=$seed
14 | done
--------------------------------------------------------------------------------
/bin/setup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #set -xu
4 |
5 | export ROOT_DIR=`pwd`
6 | export PYTHONPATH=$ROOT_DIR/src:$PYTHONPATH
7 | export XCLUSTER_ROOT=$ROOT_DIR/../xcluster
8 | export XCLUSTER_JARPATH=$XCLUSTER_ROOT/target/xcluster-0.1-SNAPSHOT-jar-with-dependencies.jar
9 |
--------------------------------------------------------------------------------
/NOTICE.txt:
--------------------------------------------------------------------------------
1 | expLinkage
2 |
3 | This software is Copyright (C) 2019 University of Massachusetts
4 | Amherst, College of Information and Computer Sciences, and is licensed under the
5 | terms of the Apache License, Version 2.0 (see LICENSE) or (at your option) any subsequent version.
6 |
7 | The license is approved by the Open Source Initiative, and is available
8 | from their website at http://www.opensource.org.
9 |
--------------------------------------------------------------------------------
/config/synth/spiral.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "synth",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "../data/spiralSmallRotated",
13 | "dataDir" : "../data/spiralSmallRotated",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 2,
22 | "margin" : 1,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": false,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 1000,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 10000,
32 | "epochToWrite" : 10000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 1.0,
37 | "testFrac" : 0.0,
38 | "devFrac" : 0.0,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink@t"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "linear",
46 | "model" : "AvgPerceptron",
47 | "inputDim" : 2,
48 | "outDisSim" : true,
49 | "lr" : 0.1,
50 | "l2Alpha" : 0.1,
51 | "alphaLr" : 0.01,
52 | "alphaInitMu" : 0.0,
53 | "alphaInitSigma": 0.1,
54 | "idenInit" : false
55 |
56 | }
--------------------------------------------------------------------------------
/config/rexa/linkage_0.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "rexa",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/rexa",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_0",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 500,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.375,
37 | "testFrac" : 0.375,
38 | "devFrac" : 0.25,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 14,
47 | "outDisSim" : true,
48 | "lr" : 0.1,
49 | "l2Alpha" : 0.01,
50 | "alphaLr" : 0.2,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/rexa/linkage_min.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "rexa",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/rexa",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_min",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 500,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.375,
37 | "testFrac" : 0.375,
38 | "devFrac" : 0.25,
39 | "shuffleData" : true,
40 |
41 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
42 | "metricsForEval" : ["f1", "dendPurity"],
43 |
44 | "modelType" : "avgLinear",
45 | "inputDim" : 14,
46 | "outDisSim" : true,
47 | "lr" : 0.1,
48 | "l2Alpha" : 0.01,
49 | "alphaLr" : 0.2,
50 | "alphaInitMu" : 0.0,
51 | "alphaInitSigma": 0.1,
52 | "idenInit" : false
53 |
54 | }
--------------------------------------------------------------------------------
/config/rexa/triplet.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "rexa",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/rexa",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "triplet",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 100,
28 |
29 | "numEpoch" : 500,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.375,
37 | "testFrac" : 0.375,
38 | "devFrac" : 0.25,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 14,
47 | "outDisSim" : true,
48 | "lr" : 0.1,
49 | "l2Alpha" : 0.01,
50 | "alphaLr" : 0.2,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/rexa/linkage_auto.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "rexa",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/rexa",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_auto",
19 | "trainModel" : true,
20 | "trainAlpha" : true,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 500,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.375,
37 | "testFrac" : 0.375,
38 | "devFrac" : 0.25,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 14,
47 | "outDisSim" : true,
48 | "lr" : 0.1,
49 | "l2Alpha" : 0.01,
50 | "alphaLr" : 0.2,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/rexa/linkage_max.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "rexa",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/rexa",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_max",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 500,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.375,
37 | "testFrac" : 0.375,
38 | "devFrac" : 0.25,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 14,
47 | "outDisSim" : true,
48 | "lr" : 0.1,
49 | "l2Alpha" : 0.005,
50 | "alphaLr" : 0.2,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/NP_Coref/triplet.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "NP_Coref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/NP_Coref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "triplet",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 100,
28 |
29 | "numEpoch" : 100,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 102,
47 | "outDisSim" : true,
48 | "lr" : 0.05,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.1,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/NP_Coref/linkage_0.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "NP_Coref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/NP_Coref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_0",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 100,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 102,
47 | "outDisSim" : true,
48 | "lr" : 0.01,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.005,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/NP_Coref/linkage_auto.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "NP_Coref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/NP_Coref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_auto",
19 | "trainModel" : true,
20 | "trainAlpha" : true,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 100,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 102,
47 | "outDisSim" : true,
48 | "lr" : 0.01,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.01,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/NP_Coref/linkage_max.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "NP_Coref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/NP_Coref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_max",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 100,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 102,
47 | "outDisSim" : true,
48 | "lr" : 0.005,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.005,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/NP_Coref/linkage_min.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "NP_Coref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/NP_Coref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_min",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 100,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 102,
47 | "outDisSim" : true,
48 | "lr" : 0.01,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.005,
51 | "alphaInitMu" : -5.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/authorCoref/linkage_0.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "authorCoref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/authorCoref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_0",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 200,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 8,
47 | "outDisSim" : true,
48 | "lr" : 0.05,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.005,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/authorCoref/triplet.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "authorCoref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/authorCoref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "triplet",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 100,
28 |
29 | "numEpoch" : 200,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 8,
47 | "outDisSim" : true,
48 | "lr" : 0.05,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.005,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/authorCoref/linkage_auto.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "authorCoref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/authorCoref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_auto",
19 | "trainModel" : true,
20 | "trainAlpha" : true,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 200,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 8,
47 | "outDisSim" : true,
48 | "lr" : 0.05,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.005,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/authorCoref/linkage_max.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "authorCoref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/authorCoref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_max",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 200,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 8,
47 | "outDisSim" : true,
48 | "lr" : 0.05,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.005,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/authorCoref/linkage_min.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "authorCoref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/authorCoref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_min",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 200,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 8,
47 | "outDisSim" : true,
48 | "lr" : 0.05,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.005,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/rexa/allWithin_allAcross.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "rexa",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/rexa",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "allWithin_allAcross",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 500,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.375,
37 | "testFrac" : 0.375,
38 | "devFrac" : 0.25,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 14,
47 | "outDisSim" : true,
48 | "lr" : 0.05,
49 | "l2Alpha" : 0.01,
50 | "alphaLr" : 0.2,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/rexa/bestWithin_bestAcross.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "rexa",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/rexa",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "bestWithin_bestAcross",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 500,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.375,
37 | "testFrac" : 0.375,
38 | "devFrac" : 0.25,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 14,
47 | "outDisSim" : true,
48 | "lr" : 0.1,
49 | "l2Alpha" : 0.01,
50 | "alphaLr" : 0.2,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/rexa/mstWithin_bestAcross.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "rexa",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/rexa",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "mstWithin_bestAcross",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 500,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.375,
37 | "testFrac" : 0.375,
38 | "devFrac" : 0.25,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 14,
47 | "outDisSim" : true,
48 | "lr" : 0.1,
49 | "l2Alpha" : 0.01,
50 | "alphaLr" : 0.2,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/NP_Coref/allWithin_allAcross.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "NP_Coref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/NP_Coref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "allWithin_allAcross",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 100,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 102,
47 | "outDisSim" : true,
48 | "lr" : 0.001,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.1,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/NP_Coref/mstWithin_bestAcross.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "NP_Coref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/NP_Coref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "mstWithin_bestAcross",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 100,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 102,
47 | "outDisSim" : true,
48 | "lr" : 0.02,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.05,
51 | "alphaInitMu" : -5.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/NP_Coref/bestWithin_bestAcross.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "NP_Coref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/NP_Coref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "bestWithin_bestAcross",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 100,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 102,
47 | "outDisSim" : true,
48 | "lr" : 0.05,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.05,
51 | "alphaInitMu" : -5.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/authorCoref/allWithin_allAcross.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "authorCoref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/authorCoref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "allWithin_allAcross",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 200,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 8,
47 | "outDisSim" : true,
48 | "lr" : 0.005,
49 | "l2Alpha" : 0.01,
50 | "alphaLr" : 0.005,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/authorCoref/bestWithin_bestAcross.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "authorCoref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/authorCoref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "bestWithin_bestAcross",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 200,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 8,
47 | "outDisSim" : true,
48 | "lr" : 0.05,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.005,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/authorCoref/mstWithin_bestAcross.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "authorCoref",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "",
13 | "dataDir" : "../data/authorCoref",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "mstWithin_bestAcross",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 0.0,
22 | "margin" : 2.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": true,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 200,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 1000,
32 | "epochToWrite" : 1000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.6,
37 | "testFrac" : 0.3,
38 | "devFrac" : 0.1,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "avgLinear",
46 | "inputDim" : 8,
47 | "outDisSim" : true,
48 | "lr" : 0.05,
49 | "l2Alpha" : 0.001,
50 | "alphaLr" : 0.005,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/faceData_20.tsv/triplet.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "faces",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "../data/faceData_20.tsv",
13 | "dataDir" : "../data/faceData_20.tsv",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "triplet",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 100.0,
22 | "margin" : 10.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": false,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 100,
28 |
29 | "numEpoch" : 1000,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 10000,
32 | "epochToWrite" : 10000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.35,
37 | "testFrac" : 0.35,
38 | "devFrac" : 0.3,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "maha",
46 | "inputDim" : 20,
47 | "outDisSim" : true,
48 | "lr" : 0.001,
49 | "l2Alpha" : 0.1,
50 | "alphaLr" : 0.01,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/faceData_20.tsv/linkage_auto.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "faces",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "../data/faceData_20.tsv",
13 | "dataDir" : "../data/faceData_20.tsv",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_auto",
19 | "trainModel" : true,
20 | "trainAlpha" : true,
21 | "threshold" : 100.0,
22 | "margin" : 10.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": false,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 1000,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 10000,
32 | "epochToWrite" : 10000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.35,
37 | "testFrac" : 0.35,
38 | "devFrac" : 0.3,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "maha",
46 | "inputDim" : 20,
47 | "outDisSim" : true,
48 | "lr" : 0.001,
49 | "l2Alpha" : 0.01,
50 | "alphaLr" : 0.01,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/faceData_20.tsv/linkage_max.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "faces",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "../data/faceData_20.tsv",
13 | "dataDir" : "../data/faceData_20.tsv",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_max",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 100.0,
22 | "margin" : 10.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": false,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 1000,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 10000,
32 | "epochToWrite" : 10000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.35,
37 | "testFrac" : 0.35,
38 | "devFrac" : 0.3,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "maha",
46 | "inputDim" : 20,
47 | "outDisSim" : true,
48 | "lr" : 0.001,
49 | "l2Alpha" : 0.01,
50 | "alphaLr" : 0.01,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/faceData_20.tsv/linkage_min.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "faces",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "../data/faceData_20.tsv",
13 | "dataDir" : "../data/faceData_20.tsv",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_min",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 100.0,
22 | "margin" : 10.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": false,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 1000,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 10000,
32 | "epochToWrite" : 10000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.35,
37 | "testFrac" : 0.35,
38 | "devFrac" : 0.3,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "maha",
46 | "inputDim" : 20,
47 | "outDisSim" : true,
48 | "lr" : 0.001,
49 | "l2Alpha" : 0.01,
50 | "alphaLr" : 0.01,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/faceData_20.tsv/linkage_0.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "faces_linkage_0",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "../data/faceData_20.tsv",
13 | "dataDir" : "../data/faceData_20.tsv",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "linkage_0",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 100.0,
22 | "margin" : 10.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": false,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 1000,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 10000,
32 | "epochToWrite" : 10000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.35,
37 | "testFrac" : 0.35,
38 | "devFrac" : 0.3,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "maha",
46 | "inputDim" : 20,
47 | "outDisSim" : true,
48 | "lr" : 0.001,
49 | "l2Alpha" : 0.01,
50 | "alphaLr" : 0.01,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/faceData_20.tsv/allWithin_allAcross.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "faces",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "../data/faceData_20.tsv",
13 | "dataDir" : "../data/faceData_20.tsv",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "allWithin_allAcross",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 100.0,
22 | "margin" : 10.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": false,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 1000,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 10000,
32 | "epochToWrite" : 10000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.35,
37 | "testFrac" : 0.35,
38 | "devFrac" : 0.3,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "maha",
46 | "inputDim" : 20,
47 | "outDisSim" : true,
48 | "lr" : 0.001,
49 | "l2Alpha" : 0.1,
50 | "alphaLr" : 0.01,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/faceData_20.tsv/mstWithin_minAcross.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "faces",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "../data/faceData_20.tsv",
13 | "dataDir" : "../data/faceData_20.tsv",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "mstWithin_minAcross",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 100.0,
22 | "margin" : 10.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": false,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 1000,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 10000,
32 | "epochToWrite" : 10000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.35,
37 | "testFrac" : 0.35,
38 | "devFrac" : 0.3,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "maha",
46 | "inputDim" : 20,
47 | "outDisSim" : true,
48 | "lr" : 0.001,
49 | "l2Alpha" : 0.01,
50 | "alphaLr" : 0.01,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/config/faceData_20.tsv/minWithin_minAcross.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "config_name": "faces",
4 |
5 | "cuda" : true,
6 | "seed" : 1234,
7 |
8 | "mode" : "train",
9 | "resultDir" : "auto",
10 | "newDirSuffix" : "",
11 |
12 | "clusterFile" : "../data/faceData_20.tsv",
13 | "dataDir" : "../data/faceData_20.tsv",
14 | "logFile" : "logFile.txt",
15 | "bestModel" : "",
16 | "logConsole" : true,
17 |
18 | "trainObj" : "minWithin_minAcross",
19 | "trainModel" : true,
20 | "trainAlpha" : false,
21 | "threshold" : 100.0,
22 | "margin" : 10.0,
23 | "normalizeLoss" : false,
24 | "normExpLinkLoss": false,
25 | "trainExpLink" : true,
26 | "scaleDist" : false,
27 | "numErrorTriplet": 0,
28 |
29 | "numEpoch" : 1000,
30 | "numEpToAvg" : 10,
31 | "epochToEval" : 10000,
32 | "epochToWrite" : 10000,
33 | "epsilon" : 0.0001,
34 | "makeScorePlots": false,
35 |
36 | "trainFrac" : 0.35,
37 | "testFrac" : 0.35,
38 | "devFrac" : 0.3,
39 | "shuffleData" : true,
40 |
41 |
42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"],
43 | "metricsForEval" : ["f1", "dendPurity"],
44 |
45 | "modelType" : "maha",
46 | "inputDim" : 20,
47 | "outDisSim" : true,
48 | "lr" : 0.001,
49 | "l2Alpha" : 0.01,
50 | "alphaLr" : 0.01,
51 | "alphaInitMu" : 0.0,
52 | "alphaInitSigma": 0.1,
53 | "idenInit" : false
54 |
55 | }
--------------------------------------------------------------------------------
/bin/compileResults.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Aggregate experiment results across seeds for one dataset, then compare the
# given training objectives against each other.
#
# Usage: sh bin/compileResults.sh <data> <seedStart> <seedEnd> <suffix> <trainObj> [<trainObj> ...]
#sh bin/compileResults.sh rexa 11 30 "" bestWithin_bestAcross mstWithin_bestAcross linkage_min allWithin_allAcross triplet linkage_0 linkage_max linkage_auto

set -xu  # echo every command; treat unset variables as errors

############################################### FOR COMPARING DIFFERENT OBJECTIVES ON VARYING SEEDS ########################################################

res_root=../results_refactor

# Positional arguments are consumed in order: dataset name, seed range
# (inclusive), result-directory suffix; everything remaining is a list of
# training objectives.
data=$1
shift

seedStart=$1
shift

seedEnd=$1
shift

suffix=$1
shift

seeds=$(seq $seedStart $seedEnd)

# For each objective: combine its per-seed results, and accumulate the
# objective name so all of them can be compared in one final pass below.
allObj=
while [ "$#" -gt 0 ];
do
obj=$1
shift
allObj=" $allObj $obj "

python -m utils.combineResults --outDirPrefix=BestDevThresh --baseResDir=$res_root/d\=$data --relResultDir=BestDevThresh --xlabel=Threshold --config=config/$data/$obj.json --seed $seeds --suffix=$suffix
# python -m utils.combineResults --outDirPrefix=BestTestThresh --baseResDir=$res_root/d\=$data --relResultDir=BestTestThresh --xlabel=Threshold --config=config/$data/$obj.json --seed $seeds --suffix=$suffix

done

# Cross-objective comparison over the same seeds (dev-threshold selection).
python -m utils.compareMethods --baseResDir=$res_root/d\=$data --outDirPrefix=BestDevThresh --trainObj $allObj --xlabel=Threshold --seed $seeds --suffix=$suffix
#python -m utils.compareMethods --baseResDir=$res_root/d\=$data --outDirPrefix=BestTestThresh --trainObj $allObj --xlabel=Threshold --seed $seeds --suffix=$suffix

#####################################################################################################################################################################

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | */.DS_Store
2 | /.DS_Store
3 | .idea/
4 |
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # pyenv
80 | .python-version
81 |
82 | # celery beat schedule file
83 | celerybeat-schedule
84 |
85 | # SageMath parsed files
86 | *.sage.py
87 |
88 | # Environments
89 | .env
90 | .venv
91 | env/
92 | venv/
93 | ENV/
94 | env.bak/
95 | venv.bak/
96 |
97 | # Spyder project settings
98 | .spyderproject
99 | .spyproject
100 |
101 | # Rope project settings
102 | .ropeproject
103 |
104 | # mkdocs documentation
105 | /site
106 |
107 | # mypy
108 | .mypy_cache/
109 |
--------------------------------------------------------------------------------
/src/utils/mds.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | from sklearn.datasets import load_digits
17 | from sklearn.manifold import MDS
18 | import random, itertools
19 |
20 | from utils.plotting import plot_clusters
21 |
def runMDSDummy():
    """Smoke test: embed the first 100 digit images in 2-D with MDS, assign
    each point a random cluster id, and plot the result with same-cluster edges.
    """
    X, _ = load_digits(return_X_y=True)
    X_transformed = MDS(n_components=2).fit_transform(X[:100])

    # Each embedded point gets one of three random cluster labels.
    pointToCluster = {tuple(point): random.randint(0, 2) for point in X_transformed}

    # Connect every pair of points that landed in the same cluster.
    edges = [
        (a[0], a[1], b[0], b[1])
        for a, b in itertools.combinations(pointToCluster, 2)
        if pointToCluster[a] == pointToCluster[b]
    ]

    plot_clusters(pointToCluster=pointToCluster, filename='../results/testMDS.png', edgeList=edges)
37 |
def runMDS(simMatrix, pidToCluster, filename):
    """Embed a precomputed dissimilarity matrix in 2-D via MDS and plot clusters.

    Two plots are written: one with edges drawn between all same-cluster point
    pairs, and one without any edges.

    Args:
        simMatrix: square matrix of precomputed pairwise dissimilarities
            (fed to MDS with dissimilarity='precomputed').
        pidToCluster: maps point index (row of simMatrix) to its cluster id.
        filename: suffix used to name the two output plot files.
    """
    embedding = MDS(n_components=2, dissimilarity='precomputed')
    X_transformed = embedding.fit_transform(simMatrix)

    # Map each embedded 2-D point back to its cluster id by row index.
    pointToCluster = {tuple(x): pidToCluster[pid] for pid, x in enumerate(X_transformed)}

    # Edge between every pair of points belonging to the same cluster.
    # (Removed unused local `numPoints` from the original implementation.)
    edges = [
        (p1[0], p1[1], p2[0], p2[1])
        for p1, p2 in itertools.combinations(pointToCluster, 2)
        if pointToCluster[p1] == pointToCluster[p2]
    ]

    plot_clusters(pointToCluster=pointToCluster, filename='../results/testMDS_{}.png'.format(filename), edgeList=edges)
    plot_clusters(pointToCluster=pointToCluster, filename='../results/testMDSWithout_{}.png'.format(filename))
55 |
56 |
if __name__ == "__main__":


    # Intentionally a no-op: runMDS needs a precomputed similarity matrix
    # supplied by a caller, so this module is only used as a library.
    # runMDS()
    pass
--------------------------------------------------------------------------------
/resources/line_styles.json:
--------------------------------------------------------------------------------
1 | {
2 | "test": {
3 | "style": {
4 | "label": "Test",
5 | "color": "orange",
6 | "mec": "orange",
7 | "marker": "*",
8 | "ls": "-",
9 | "lw": 2,
10 | "mew": 2,
11 | "ms": 6,
12 | "elinewidth": 0.1
13 | },
14 | "fill style": {
15 | "color": "orange",
16 | "alpha": 0.3
17 | }
18 | },
19 | "dev": {
20 | "style": {
21 | "label": "Dev",
22 | "color": "salmon",
23 | "mec": "salmon",
24 | "marker": "*",
25 | "ls": "-",
26 | "lw": 2,
27 | "mew": 2,
28 | "ms": 6,
29 | "elinewidth": 0.1
30 | },
31 | "fill style": {
32 | "color": "salmon",
33 | "alpha": 0.3
34 | }
35 | },
36 | "dev_euclid": {
37 | "style": {
38 | "label": "Dev(Euclidean)",
39 | "color": "lightseagreen",
40 | "mec": "lightseagreen",
41 | "marker": "*",
42 | "ls": "-",
43 | "lw": 2,
44 | "mew": 2,
45 | "ms": 6,
46 | "elinewidth": 0.1
47 | },
48 | "fill style": {
49 | "color": "lightseagreen",
50 | "alpha": 0.3
51 | }
52 | },
53 | "test_euclid": {
54 | "style": {
55 | "label": "Test(Euclidean)",
56 | "color": "olive",
57 | "mec": "olive",
58 | "marker": "*",
59 | "ls": "-",
60 | "lw": 2,
61 | "mew": 2,
62 | "ms": 6,
63 | "elinewidth": 0.1
64 | },
65 | "fill style": {
66 | "color": "olive",
67 | "alpha": 0.3
68 | }
69 | },
70 | "train": {
71 | "style": {
72 | "label": "Train",
73 | "color": "maroon",
74 | "mec": "maroon",
75 | "marker": "o",
76 | "ls": "-",
77 | "lw": 2,
78 | "mew": 2,
79 | "ms": 6,
80 | "elinewidth": 0.1
81 | },
82 | "fill style": {
83 | "color": "red",
84 | "alpha": 0.3
85 | }
86 | },
87 | "train_euclid": {
88 | "style": {
89 | "label": "Train(Euclidean)",
90 | "color": "darkgreen",
91 | "mec": "darkgreen",
92 | "marker": "o",
93 | "ls": "-",
94 | "lw": 2,
95 | "mew": 2,
96 | "ms": 6,
97 | "elinewidth": 0.1
98 | },
99 | "fill style": {
100 | "color": "darkgreen",
101 | "alpha": 0.3
102 | }
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/src/utils/fixNPCorefDataFormat.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | from pathlib import Path
17 |
def fixDataFormat(origDir ,newDir):
    """Re-encode NP-Coref canopies: map +/- pair labels to 1/0 and drop the
    leading doc-number column of pairFeatures.csv; gtClusters.tsv is copied
    through unchanged. One sub-folder per canopy is created under newDir.
    """
    Path(newDir).mkdir(parents=True, exist_ok=True)

    canopyList = sorted(str(entry).split("/")[-1] for entry in Path(origDir).glob("*") if entry.is_dir())

    print("CanopyList:", canopyList)

    labelMap = {"+": "1", "-": "0"}
    for canopy in canopyList:
        Path("{}/{}".format(newDir, canopy)).mkdir(parents=True, exist_ok=True)

        srcFeatPath = "{}/{}/pairFeatures.csv".format(origDir, canopy)
        dstFeatPath = "{}/{}/pairFeatures.csv".format(newDir, canopy)
        with open(srcFeatPath, 'r') as srcFile, open(dstFeatPath, 'w') as dstFile:
            for line in srcFile:
                tokens = line.strip().split(",")
                if tokens[-1] not in labelMap:
                    raise Exception("Invalid last token ..", tokens)
                tokens[-1] = labelMap[tokens[-1]]

                # Exclude the first column (doc number) from the output
                dstFile.write(",".join(tokens[1:]) + "\n")

        srcGtPath = "{}/{}/gtClusters.tsv".format(origDir, canopy)
        dstGtPath = "{}/{}/gtClusters.tsv".format(newDir, canopy)
        with open(srcGtPath, 'r') as srcFile, open(dstGtPath, 'w') as dstFile:
            for line in srcFile:
                dstFile.write(line)
48 |
if __name__ == '__main__':
    # Strip the docNum column that prefixes each line in pairFeatures.tsv
    fixDataFormat(origDir="../data/NP_Coref_withDocNum",
                  newDir="../data/NP_Coref")
--------------------------------------------------------------------------------
/src/hier_clust/random_split.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import numpy as np
17 | import time
18 | from hier_clust.expLink import getPidToPredClusters, computeDendPurity
19 |
20 |
def run_random_split(pidToCluster, k=None):
    """Build a random binary dendrogram by repeatedly merging two uniformly
    chosen active clusters until only one remains.

    :param pidToCluster: dict mapping point id -> ground-truth cluster id
    :param k: if given, record the flat clustering at the moment exactly
              k clusters are active
    :return: (y_pred, dendPurity) where y_pred is the flat cluster label per
             point and dendPurity is 0 when no ground truth is available
    """
    numPoints = len(pidToCluster)
    activeClusters = list(range(numPoints))
    newCid = numPoints

    pidToParent = {}
    children = {pid: None for pid in activeClusters}

    y_pred = None
    while len(activeClusters) > 1:

        # Pick two distinct active clusters uniformly at random
        chosen = np.random.choice(activeClusters, 2, replace=False)
        c1, c2 = chosen[0], chosen[1]

        # Replace the two merged clusters by their new parent
        activeClusters.remove(c1)
        activeClusters.remove(c2)
        activeClusters.append(newCid)

        children[newCid] = (c1, c2)
        pidToParent[c1] = newCid
        pidToParent[c2] = newCid

        if k is not None and len(activeClusters) == k:
            # Snapshot the flat clustering while exactly k clusters are active
            flatAssign = getPidToPredClusters(numPoints=numPoints, pidToParent=pidToParent)
            y_pred = [flatAssign[pid] for pid in range(numPoints)]

        newCid += 1

    if y_pred is None:
        # Loop finished without hitting k active clusters; everything ends up
        # in a single flat cluster
        y_pred = [1 for _ in range(numPoints)]

    if pidToCluster is None:
        dendPurity = 0
    else:
        dendPurity = computeDendPurity(pidToCluster=pidToCluster, children=children, pidToParent=pidToParent)

    return y_pred, dendPurity
66 |
67 |
--------------------------------------------------------------------------------
/src/models/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | from utils.Config import Config
17 | from models.linearClassifier import LinearClassifier, AvgLinearClassifier
18 | from models.templateClassifier import Classifier
19 | from models.mahalabonis import GenLinkMahalanobis
20 |
def create_new_model(config):
    """ Create a new model object based on the modelType field in the config

    :param config: Config object; config.modelType selects the class
    :return: New created object
    :raises Exception: if config.modelType is not a known model type
    """
    assert isinstance(config, Config)

    if config.modelType == "linear":
        # Pairwise classifier
        return LinearClassifier(config)
    if config.modelType == "avgLinear":
        # Pairwise classifier that uses avgWeights at the end of training
        return AvgLinearClassifier(config)
    if config.modelType == "maha":
        return GenLinkMahalanobis(config)
    if config.modelType == "template":
        # Template to use skLearn classifiers with the current code setup
        return Classifier(config)

    raise Exception("Unknown Model: {}".format(config.modelType))
40 |
41 | # def load_model(config):
42 | # """ Load model object using the bestModel field in the config
43 | #
44 | # :param config:
45 | # :return:Loaded Model Object
46 | # """
47 | # assert isinstance(config,Config)
48 | # if config.modelType == "linear": # Learn a pairwise classifier
49 | # model = LinearClassifier.load(config.bestModel)
50 | # elif config.modelType == "avgLinear": # Learn a pairwise classifier and uses avgWeights at the end of training
51 | # model = AvgLinearClassifier.load(config.bestModel)
52 | # elif config.modelType == "template":
53 | # model = Classifier() # This class is just a template to use with skLearn classifiers with current code setup
54 | # else:
55 | # raise Exception("Unknown Model: {}".format(config.modelType))
56 | #
57 | # return model
58 | #
--------------------------------------------------------------------------------
/src/models/templateClassifier.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 |
17 | import torch
18 | import numpy as np
19 |
class Classifier(torch.nn.Module):
    """Template model for plugging external classifiers (anything exposing a
    ``predict`` method, e.g. sklearn estimators) into the trainer code, which
    expects the torch-module interface below. Only batched pair prediction is
    implemented; the point-based forward variants are deliberately abstract.
    """

    def __init__(self, config):
        super(Classifier, self).__init__()

        self.config = config
        # External classifier with a .predict() method; assigned by the caller
        self.clf = None
        # Placeholder module so the trainer finds the usual seqModel attribute
        self.seqModel = torch.nn.Sequential(
            torch.nn.Linear(self.config.inputDim, self.config.inputDim)
        )

    def __str__(self):
        header = "-----------------Classifier Parameters-----------------------------"
        footer = "-------------------------------------------------------------------"
        return header + "\n" + str(self.clf) + footer

    def getWeightStr(self):
        # The wrapped classifier exposes no torch parameters
        return "\n\nNo parameters\n\n"

    def pairForward(self, pairFeature):
        raise NotImplementedError
        # prediction = self.clf.predict(pairFeature)
        # return torch.autograd.Variable(torch.FloatTensor(prediction),requires_grad=False)

    def pairBatchForward(self, pairFeatureList):
        """Score every pair feature vector via self.clf; returns an (n,1)
        non-differentiable Variable."""
        scores = self.clf.predict(pairFeatureList)
        scores = torch.FloatTensor(scores).view(-1, 1)
        return torch.autograd.Variable(scores, requires_grad=False)

    def forward(self, point1, point2):
        raise NotImplementedError

    def forwardPlain(self, point1, point2):
        # Would return the plain (non-Variable) distance between two points
        raise NotImplementedError

    def batchForwardWithin(self, points):
        # Would return an n x n adjacency matrix over one list of points
        raise NotImplementedError

    def batchForwardAcross(self, pointList1, pointList2):
        # Would return an n1 x n2 adjacency matrix across two point lists
        raise NotImplementedError

    def batchForwardOneToOne(self, pointList1, pointList2):
        raise NotImplementedError
69 |
70 |
71 |
if __name__ == '__main__':
    # Seed both RNGs for reproducibility, although nothing executes here
    torch.manual_seed(2)
    np.random.seed(1)
    print("There is no code to run here...")
--------------------------------------------------------------------------------
/src/utils/process_aminer_stats.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import math
17 | import numpy as np
18 | from utils.basic_utils import read_canopy_data
19 | import json
20 | from collections import defaultdict
21 | import itertools,csv
22 |
def run(dataDir):
    """Compute per-canopy statistics for an author-coref dataset and write
    them under resources/aminer/: one json file per measure plus a csv of
    pairwise Pearson correlations between all measures.

    :param dataDir: folder readable by read_canopy_data
    """
    canopyData = read_canopy_data(dataDir)

    all_measures = defaultdict(dict)
    for canopyId in canopyData:
        canopy = canopyData[canopyId]
        # Cluster sizes drive most of the statistics below
        clusterSizes = [len(canopy["clusterToPids"][c]) for c in canopy["clusterToPids"]]
        all_measures["numEnts"][canopyId] = len(canopy["clusterToPids"])
        all_measures["numMents"][canopyId] = len(canopy["pidToCluster"])
        all_measures["avgMents"][canopyId] = np.mean(clusterSizes)
        all_measures["stdMents"][canopyId] = np.std(clusterSizes)
        all_measures["numSingletons"][canopyId] = sum(1 for s in clusterSizes if s == 1)

    # Fix: use context managers so every json file is flushed and closed
    # deterministically (the originals leaked open file handles).
    for measure in all_measures:
        with open("resources/aminer/aminer_{}.json".format(measure), "w") as outFile:
            json.dump(all_measures[measure], outFile)

    with open("resources/aminer/aminer_origin.json", "r") as inFile:
        all_measures["origin"] = json.load(inFile)

    corrCoeff = {}
    for m1, m2 in itertools.combinations_with_replacement(all_measures, 2):
        # NOTE(review): canopies are taken from m1's keys; this assumes the
        # "origin" json covers the same canopy ids -- confirm upstream.
        canopies = list(all_measures[m1].keys())
        X_1 = [all_measures[m1][c] for c in canopies]
        X_2 = [all_measures[m2][c] for c in canopies]
        corr = np.corrcoef(X_1, X_2)  # computed once instead of twice
        corrCoeff[m1, m2] = corr[0, 1]
        corrCoeff[m2, m1] = corr[1, 0]

    mlist = list(all_measures.keys())
    with open("resources/aminer/aminer_correlation.csv", "w") as csvFile:
        # Renamed from "f": the original rebound the file handle name to the
        # DictWriter, shadowing the handle it wrapped.
        writer = csv.DictWriter(csvFile, ["Method"] + mlist)
        writer.writeheader()
        for m1 in mlist:
            row = {"Method": m1}
            for m2 in mlist:
                row[m2] = "{:.3f}".format(corrCoeff[m1, m2])

            writer.writerow(row)
65 |
66 |
67 |
68 |
69 |
70 |
if __name__ == "__main__":
    # Compute canopy statistics for the author-coref dataset
    run("../data/authorCoref")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Supervised Hierarchical Clustering with Exponential Linkage
2 | This repository contains code used in experiments for our ICML 2019 paper titled "[Supervised Hierarchical Clustering with Exponential Linkage](http://proceedings.mlr.press/v97/yadav19a.html)".
3 |
4 | ## Setup ##
5 |
6 | Clone* and set up the **xcluster** repository from [https://github.com/iesl/xcluster](https://github.com/iesl/xcluster).
7 | Make sure **xcluster** repo is cloned in the same folder as this repo i.e. you should have **xcluster** and **expLinkage** folder in the same parent folder.
8 |
9 | Set environment variables:
10 |
11 | ```
12 | cd expLinkage
13 | source bin/setup.sh
14 | ```
15 |
16 | ## Data Setup ##
17 |
18 |
19 | #### Data in *n*-dim vector space ####
20 |
21 | `clusterFile` parameter in config files should point to the tsv file which contains data with each line in following format:
22 |
23 | `<point_id> <cluster_id> <feature_1> .... <feature_n>`
24 |
25 | #### Data with features defined on every pair of points ####
26 |
27 | `dataDir` parameter in config files should point to data folder which should be present in the following format:
28 | ```bash
29 | ├── NP_Coref
30 | | ├── doc1
31 | | ├── gtClusters.tsv
32 | | ├── pairFeatures.tsv
33 | | ├── doc2
34 | | ├── ...
35 | | ├── docn
36 |
37 | ```
38 |
39 | All data should be in a single folder with a separate sub-folder for each canopy or set of points. Each sub-folder contains files: `gtClusters.tsv` and `pairFeatures.tsv`.
40 |
41 | `gtClusters.tsv` contains information about ground-truth clusters for each point in following format:
42 | `<point_id> <cluster_id>`
43 |
44 | `pairFeatures.tsv` contains feature vector for each pair of points in following format:
45 | `<point_id_1> <point_id_2> <feature_1> ... <feature_n> <label>`
46 |
47 | Set of points in each subfolder will be clustered separately.
48 |
49 | ## Run Code ##
50 |
51 | #### For data in *n*-dim vector space ####
52 |
53 | ```bash
54 | cd expLinkage
55 | python src/trainer/train_vect_data.py --config=<path to config file> --seed=<random seed>
56 | ```
57 |
58 | #### For data with features on every pair of points ####
59 |
60 | ```bash
61 | cd expLinkage
62 | python src/trainer/train_pair_feat.py --config=<path to config file> --seed=<random seed>
63 | ```
64 |
65 | Config files for all experiments in the paper are present in [config](config) folder.
66 |
67 |
68 | ## Notes ##
69 | - *Code from **xcluster** repository is only used for evaluating dendrogram purity and is not crucial for training as such (if evaluation does not involve computing dendrogram purity or no evaluation on dev set is performed during training).
70 | - Compatible cuda and pytorch versions:
71 | - cudnn : version 7.6.0, (build: cuda10.0_0)
72 | - pytorch : version 1.2.0 (build cuda100py36h938c94c_0)
73 |
--------------------------------------------------------------------------------
/src/utils/processADANA.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import csv, itertools
17 | from pathlib import Path
18 |
19 | # Read feature vectors from dataDir and write them to outDir after processing, one canopy at a time
20 | def processADANA(dataDir, outDir):
21 | authorList = [str(f) for f in Path(dataDir).glob("*.xml") if f.is_file()]
22 | authorList = [authorFile[:-4] for authorFile in authorList]
23 | print("Author list:{}".format(authorList))
24 | for authorFile in authorList:
25 |
26 | authorName = authorFile.split("/")[-1]
27 | if authorFile.endswith("Wei Wang"):
28 | print("Skipping {} because it does not have {}_ans.txt".format(authorFile,authorFile))
29 | continue
30 |
31 | pairFeatures = {}
32 | pidToCluster = {}
33 |
34 | with open("{}_ans.txt".format(authorFile),"r") as f:
35 | for line in f:
36 | line =line.split()
37 | paperId, clusterId = int(line[0]),int(line[1])
38 | pidToCluster[paperId] = clusterId
39 |
40 | # Initialize pairFeatures to empty list
41 | pidList = sorted(pidToCluster)
42 | for p1,p2 in itertools.combinations(pidList,2):
43 | pairFeatures[(p1,p2)] = []
44 |
45 | with open("{}.txt".format(authorFile),"r") as f:
46 | numPapers = int(f.readline().strip())
47 | for featNum in range(8):
48 | for i in range(numPapers-1):
49 | line = f.readline()
50 | line = [float(x) for x in line.strip().split()]
51 | for j,val in enumerate(line):
52 | pairFeatures[(i,i+j+1)].append(val)
53 |
54 | line = f.readline() # Read empty line between two feature matrices
55 |
56 | print("Writing down data for author:{}".format(authorFile))
57 | Path("{}/{}".format(outDir, authorName)).mkdir(parents=True, exist_ok=True)
58 | with open("{}/{}/gtClusters.tsv".format(outDir, authorName), "w") as f:
59 | for pid in pidToCluster:
60 | f.write("{}\t{}\n".format(pid, pidToCluster[pid]))
61 |
62 | with open("{}/{}/pairFeatures.csv".format(outDir, authorName), "w") as f:
63 | writer = csv.writer(f)
64 | for p1, p2 in pairFeatures:
65 | line = [p1, p2] + pairFeatures[(p1, p2)]
66 | if pidToCluster[p1] == pidToCluster[p2]:
67 | line.append(1)
68 | else:
69 | line.append(0)
70 |
71 | writer.writerow(line)
72 |
73 |
if __name__ == "__main__":
    # Convert the raw ADANA distribution into the canopy format used here
    processADANA(dataDir="../data/rich-author-disambiguation-data/experimental-results",
                 outDir="../data/authorCoref")
--------------------------------------------------------------------------------
/src/utils/createNPDataset.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import os
17 | from pathlib import Path
18 | from scipy.sparse import csr_matrix
19 | from scipy.sparse.csgraph import connected_components
20 | from utils.fixNPCorefDataFormat import fixDataFormat
21 |
def createDataset(dataDir,outDir):
    """Build both splits of the NP-Coref dataset from the reconcile corpus."""
    for split in ("train", "test"):
        processPairwiseData(dataDir, split, outDir)
26 |
# dataType= "train" or "test"
def processPairwiseData(dataDir,dataType,outDir):
    """Copy pair features for every doc listed in the split file and derive
    gtClusters.tsv by running connected components over the "+" pairs.

    :param dataDir: folder with the split file and per-doc arff feature files
    :param dataType: split file name to read ("train" or "test")
    :param outDir: destination folder; one sub-folder per doc is created
    :raises Exception: if a feature line ends with neither "+" nor "-"
    """
    with open(dataDir + "/" + dataType, "r") as f:
        fileList = f.read().split()

    Path(outDir).mkdir(parents=True, exist_ok=True)

    for docName in fileList:  # renamed from "file": shadowed a builtin
        Path("{}/{}".format(outDir, docName)).mkdir(parents=True, exist_ok=True)
        rows, cols, data = [], [], []
        uniquePts = {}
        # Fix: "with" guarantees the output file is closed (and flushed) even
        # if parsing raises; the original leaked featureFile on that path.
        with open("{}/{}/pairFeatures.csv".format(outDir, docName), "w") as featureFile, \
                open("{}/{}/features.development/features.arff".format(dataDir, docName), "r") as f:
            for line in f:
                if line.startswith("@"): continue      # arff header lines
                if len(line.split(",")) < 2: continue  # blank / short lines

                featureFile.write(line)
                lineV = line.strip().split(",")
                # Columns: docNum, id1, id2, features..., label
                id1, id2 = int(lineV[1]), int(lineV[2])

                uniquePts[id1] = 1
                uniquePts[id2] = 1
                if lineV[-1] == "+":
                    # Record the edge in both directions, then recover the
                    # ground-truth clusters via connected components
                    rows += [id1, id2]
                    cols += [id2, id1]
                    data += [1, 1]
                elif lineV[-1] == "-":
                    pass
                else:
                    print(lineV)
                    raise Exception("Invalid end token")

        numPoints = len(uniquePts)
        # NOTE(review): assumes point ids are 0..numPoints-1; an id >=
        # numPoints would make csr_matrix raise -- confirm for this corpus.
        sparseMatrix = csr_matrix((data, (rows, cols)), shape=(numPoints, numPoints))
        connComp = connected_components(sparseMatrix)

        with open("{}/{}/gtClusters.tsv".format(outDir, docName), "w") as gtFile:
            for pid in range(numPoints):
                gtFile.write("{}\t{}\n".format(pid, connComp[1][pid]))
76 |
77 |
if __name__ == '__main__':
    import shutil  # local import: only needed when run as a script

    # Build the dataset into a temporary folder first
    tempOutDir = "../data/NP_Coref_temp"
    createDataset(dataDir="../data/reconcile/uw-corpus",outDir=tempOutDir)


    newDir = "../data/NP_Coref"

    # Remove docNum from this temporary dataset
    fixDataFormat(origDir=tempOutDir, newDir=newDir)

    # Fix: shutil.rmtree instead of os.system("rm -r ...") -- portable and
    # not subject to shell quoting issues.
    shutil.rmtree(tempOutDir)
--------------------------------------------------------------------------------
/src/utils/projectFaces.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import sys
17 | import numpy as np
18 | from sklearn.decomposition import PCA
19 |
20 | from utils.basic_utils import read_clusters
21 |
22 |
def projectFaces(filename, dim):
    """PCA-project the face vectors in *filename* down to *dim* dimensions
    and write them to ../data/faceData_<dim>.tsv, one tab-separated row of
    point id, cluster id and features per point.
    """
    clusterData = read_clusters(filename)

    # Flatten all clusters into one matrix, remembering each cluster's span
    allPoints = []
    spans = {}
    for cid in clusterData:
        begin = len(allPoints)
        allPoints += clusterData[cid]
        spans[cid] = (begin, len(allPoints))

    matrix = np.array([list(point) for point in allPoints])
    # print(matrix.shape)

    pca = PCA(n_components=dim, random_state=0)
    X_prime = pca.fit_transform(matrix)
    print("Explained variance ratio for {} components\t{}\n{}".format(dim,pca.explained_variance_ratio_,sum(pca.explained_variance_ratio_)))

    # Slice the projected matrix back into per-cluster arrays
    projected = {}
    for cid in clusterData:
        begin, end = spans[cid]
        projected[cid] = X_prime[begin:end]

    with open("../data/faceData_{}.tsv".format(dim), "w") as writer:
        pointId = 0
        for cid in projected:
            for point in projected[cid]:
                row = "{}\t{}\t".format(pointId, cid)
                row += "\t".join("{:.2f}".format(x) for x in point)
                writer.write(row + "\n")
                pointId += 1
58 |
59 |
def normalizeFaces(filename):
    """Scale every face vector by a fixed constant and write the result to
    <filename-stem>_norm_10.tsv as tab-separated point id, cluster id and
    rounded features.

    NOTE(review): this reads like a leftover experiment script -- the
    data-driven maxVal is immediately overridden with 100, and Z, origPoint
    and indices are computed but never used; alternative normalisations are
    left commented out. Documented rather than cleaned up so the exact
    output is preserved.
    """
    clusterData = read_clusters(filename)

    # Flatten all clusters, remembering each cluster's span.
    # (indices is never read afterwards -- dead bookkeeping.)
    pointList = []
    indices = {}
    for cid in clusterData:
        start = len(pointList)
        pointList += clusterData[cid]
        end = len(pointList)
        indices[cid] = (start, end)


    # Largest absolute coordinate over the whole dataset ...
    maxVal = 0.
    for cid in clusterData:
        for point in clusterData[cid]:
            tempMax = np.max([abs(x) for x in point])
            maxVal = max(tempMax, maxVal)

    # ... which is then discarded: every coordinate is divided by a flat 100
    maxVal = 100
    newFilename = filename[:-4] + "_norm_10.tsv"
    with open(newFilename, "w") as writer:
        pointId = 0
        for cid in clusterData:
            for point in clusterData[cid]:
                row = "{}\t{}\t".format(pointId, cid)
                # Z = sum(point)
                Z = np.linalg.norm(point)  # unused; the /Z variant below is commented out
                origPoint = point
                point = [x/maxVal for x in point]
                # point = [x/Z for x in point]
                row += "\t".join("{:.2f}".format(x) for x in point)
                # row += "\t".join("{:.2f}".format(x) for x in origPoint)
                print(row)
                writer.write(row + "\n")
                pointId += 1

    print(maxVal)
97 |
if __name__ == "__main__":
    # Usage: projectFaces.py <target dimensionality>
    targetDim = int(sys.argv[1])
    projectFaces("../data/faceData.tsv", targetDim)
102 |
--------------------------------------------------------------------------------
/src/utils/processRexa.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import csv
17 | from pathlib import Path
18 | from scipy.sparse import csr_matrix
19 | from scipy.sparse.csgraph import connected_components
20 |
21 | # Read feature vectors from dataDir and write them to outDir after processing, one canopy at a time
22 | def processRexa(dataDir, outDir):
23 |
24 | folderList = [str(f) for f in Path(dataDir).glob("*") if f.is_dir()]
25 |
26 | for ctr, folder in enumerate(folderList):
27 | canopyId = folder.split("/")[-1]
28 | pairFeatures = {}
29 | mentToId = {}
30 | pidToCluster = {}
31 | rows,cols,data = [],[],[]
32 | with open("{}/pair_vecs.tsv".format(folder),"r") as f:
33 | reader = csv.reader(f,delimiter="\t")
34 | for line in reader:
35 | m1, m2 = line[0], line[1]
36 | featureVec = line[3:-1]
37 |
38 | pairFeatures[(m1, m2)] = featureVec
39 | mentToId[m1] = 1
40 | mentToId[m2] = 1
41 | if line[2] == "1":
42 | # Accumulate data to create sparse matrix and then run connected components to retrieve gt clusters
43 | rows += [m1]
44 | cols += [m2]
45 | data += [1]
46 |
47 | rows += [m2]
48 | cols += [m1]
49 | data += [1]
50 | elif line[2] == "0":
51 | pass
52 | else:
53 | print(line[2])
54 | raise Exception("Invalid end token")
55 |
56 | mentToId = {ment:ctr for ctr,ment in enumerate(mentToId)} # Assign unique id to each point
57 |
58 | # Find out ground-truth cluster after running connected components
59 | rows = [mentToId[ment] for ment in rows]
60 | cols = [mentToId[ment] for ment in cols]
61 | numPoints = len(mentToId)
62 | sparseMatrix = csr_matrix((data, (rows, cols)), shape=(numPoints, numPoints))
63 | connComp = connected_components(sparseMatrix)
64 |
65 | for pid in range(numPoints):
66 | pidToCluster[pid] = connComp[1][pid]
67 |
68 | Path("{}/{}".format(outDir, canopyId)).mkdir(parents=True, exist_ok=True)
69 | with open("{}/{}/gtClusters.tsv".format(outDir, canopyId), "w") as f:
70 | for pid in pidToCluster:
71 | f.write("{}\t{}\n".format(pid, pidToCluster[pid]))
72 |
73 | with open("{}/{}/pairFeatures.csv".format(outDir, canopyId), "w") as f:
74 | writer = csv.writer(f)
75 | for m1,m2 in pairFeatures:
76 | line = [ mentToId[m1], mentToId[m2] ] + pairFeatures[(m1,m2)]
77 |
78 | if pidToCluster[mentToId[m1]] == pidToCluster[mentToId[m2]]:
79 | line.append(1)
80 | else:
81 | line.append(0)
82 |
83 | writer.writerow(line)
84 |
if __name__ == "__main__":
    # dataDir = "../data/data/rexa/canopy"
    processRexa(dataDir="../data/data_rexa_all/nick-rexa/rexa/canopy",
                outDir="../data/rexa_new")
--------------------------------------------------------------------------------
/src/trainer/BaseTrainer.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | from utils.Config import Config
17 | import torch
18 | import os
19 |
class BaseTrainer(object):
    """Common scaffolding for trainers: holds config, model and canopy state,
    loads saved models with fallbacks, and (re)builds the Adam optimizer."""

    def __init__(self, config):
        """:param config: Config object driving training behaviour"""
        super(BaseTrainer, self).__init__()

        assert isinstance(config, Config)
        self.config = config
        self.logger = None      # set by subclasses before use
        self.optimizer = None   # created by resetOptimizer()
        self.trainCanopies = {}
        self.testCanopies = {}
        self.devCanopies = {}

    def __str__(self):
        return "Base Trainer Class"

    def train(self):
        """Subclasses implement the actual training loop."""
        raise NotImplementedError

    def loadModel(self):
        """Load the best saved model and reset the optimizer to track the
        loaded parameters.

        Search order: config.bestModel as given, then relative to
        config.resultDir, then the default checkpoint names in resultDir.
        """
        bestModel = self.config.bestModel
        if not os.path.isfile(bestModel):
            bestModel = os.path.join(self.config.resultDir, self.config.bestModel)

        if os.path.isfile(bestModel):
            self.model = torch.load(bestModel)
            self.logger.info("Loading model from:{}".format(bestModel))
        else:
            try:
                bestModel = os.path.join(self.config.resultDir, "model_alpha.torch")
                self.model = torch.load(bestModel)
                self.logger.info("Loading model from:{}".format(bestModel))
            except Exception:
                # Fix: was a bare "except:", which also swallowed
                # KeyboardInterrupt/SystemExit. Fall back to the last-resort
                # checkpoint name; torch.load raising here is fatal by design.
                bestModel = os.path.join(self.config.resultDir, "model.torch")
                self.model = torch.load(bestModel)
                self.logger.info("Loading model from:{}".format(bestModel))

        self.resetOptimizer()

    def resetOptimizer(self):
        """Create a fresh Adam optimizer over whichever of the model weights
        and the linkage alpha are marked trainable in the config.

        :raises ValueError: if neither trainModel nor trainAlpha is set
        """
        if self.config.trainObj == "linkage_auto":
            assert self.config.trainModel and self.config.trainAlpha

        if self.config.trainModel and self.config.trainAlpha:
            # Optimize the network weights and linkAlpha with separate lrs
            assert self.config.trainObj == "linkage_auto"
            self.optimizer = torch.optim.Adam([{'params': self.model.seqModel.parameters(), 'lr': self.config.lr, 'weight_decay': self.config.l2Alpha}])
            self.optimizer.add_param_group({'params': self.model.linkAlpha, 'lr': self.config.alphaLr})

        elif self.config.trainAlpha:  # Only linkAlpha is trainable
            self.optimizer = torch.optim.Adam([{'params': self.model.linkAlpha, "lr": self.config.alphaLr}])

        elif self.config.trainModel:  # Only the model weights are trainable
            assert self.config.trainObj != "linkage_auto"
            self.optimizer = torch.optim.Adam([{'params': self.model.seqModel.parameters(), 'lr': self.config.lr, 'weight_decay': self.config.l2Alpha}])

        else:
            # Fix: the original called torch.optim.Adam() with no parameters,
            # which raises TypeError. Fail loudly with a clear message.
            raise ValueError("resetOptimizer: at least one of trainModel/trainAlpha must be True")
81 |
82 |
--------------------------------------------------------------------------------
/src/eval/evalF1.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import time
17 | import os
18 | import itertools
19 |
# y_true & y_pred is list of labels for each point. Recall, Precision, F1 is for predicted edges on underlying points
def comp_prec_rec_f1_fast(y_true, y_pred):
	"""Pairwise precision/recall/F1 computed by shelling out to xcluster.

	Writes per-point (id, label) TSVs to the current directory, runs
	xcluster's bin/util/score_pairwise.sh on them, and parses the single
	result line it produces.

	Requires the XCLUSTER_ROOT environment variable to point at an xcluster
	checkout. NOTE(review): the script is handed hard-coded
	"../singleLinkage/predicted.tsv" paths, i.e. it assumes this process
	runs in a sibling directory named "singleLinkage" — confirm against the
	run scripts before reusing elsewhere.

	Returns:
		dict with keys "precision", "recall", "f1" (floats; defaults of
		0, 1, 0 survive if the result file is empty).
	"""
	assert (len(y_true) == len(y_pred))
	t1 = time.time()
	# Dump predicted and gold labelings as "<point id>\t<label>" TSV files.
	with open("predicted.tsv", "w") as predicted:
		for id, val in enumerate(y_pred):
			predicted.write(str(id) + "\t" + str(val) + "\n")

	with open("goldFile.tsv", "w") as goldFile:
		for id, val in enumerate(y_true):
			goldFile.write(str(id) + "\t" + str(val) + "\n")

	# Timestamp makes the temp result file (mostly) unique per invocation.
	filenum = time.time()
	command = "cd $XCLUSTER_ROOT && source bin/setup.sh &&"
	command += "sh bin/util/score_pairwise.sh ../singleLinkage/predicted.tsv ../singleLinkage/goldFile.tsv algo data None > tempResult_{}".format(filenum)
	print("executing command::\n{}\n".format(command))
	os.system(command)
	precision, recall, f1 = 0, 1, 0
	XCLUSTER_ROOT = os.getenv("XCLUSTER_ROOT")

	# score_pairwise.sh emits "<algo> <data> <precision> <recall> <f1>".
	with open("{}/tempResult_{}".format(XCLUSTER_ROOT,filenum), "r") as results:
		for line in results:
			algo, data, precision, recall, f1 = line.split()
			precision = float(precision)
			recall = float(recall)
			f1 = float(f1)

	# Clean up the temp result file.
	command = "rm {}/tempResult_{}".format(XCLUSTER_ROOT, filenum)
	print("executing command::\n{}\n".format(command))
	os.system(command)
	t2 = time.time()
	print("Time taken = {:.3f}".format(t2 - t1))
	return {"precision": precision, "recall": recall, "f1": f1}
53 |
# y_true & y_pred is list of labels for each point. Recall, Precision, F1 is for predicted edges on underlying points
def comp_prec_rec_f1(y_true, y_pred):
	"""Pairwise precision/recall/F1 of a predicted clustering vs. gold.

	A "positive" is an unordered pair of points placed in the same predicted
	cluster; it is a true positive when the pair also shares a gold cluster.

	Resolves the old TODO: instead of enumerating all O(n^2) pairs (and
	pointlessly counting true negatives), counts are derived in O(n) from
	cluster sizes — pairs within predicted clusters, within gold clusters,
	and within their intersections. Labels must be hashable (they are used
	as Counter keys); any cluster-id type satisfies this.

	Returns:
		dict with "precision", "recall", "f1" and the raw numerator /
		denominator counts, same keys as before.
	"""
	from collections import Counter

	assert (len(y_true) == len(y_pred))

	def pairs(n):
		# Unordered pairs among n co-clustered points: C(n, 2).
		return n * (n - 1) // 2

	# TP = pairs sharing both a gold and a predicted cluster, counted via the
	# (gold, pred) contingency cells; predicted/gold pair totals give FP/FN.
	truePos = sum(pairs(c) for c in Counter(zip(y_true, y_pred)).values())
	predPairs = sum(pairs(c) for c in Counter(y_pred).values())  # TP + FP
	goldPairs = sum(pairs(c) for c in Counter(y_true).values())  # TP + FN
	falsePos = predPairs - truePos
	falseNeg = goldPairs - truePos

	precision = truePos / (truePos + falsePos) if (truePos + falsePos) > 0 else 1.
	recall = truePos / (truePos + falseNeg) if (truePos + falseNeg) > 0 else 1.
	f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0. else 0.

	return {"precision": precision, "recall": recall, "f1": f1,
			"recall_num": truePos, "recall_den": truePos + falseNeg,
			"precision_num": truePos, "precision_den": truePos + falsePos}
83 |
--------------------------------------------------------------------------------
/src/eval/evalDendPurity.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import os
17 | import time
18 |
def write_tree(children_, Y, X_labels, filename):
	"""Serialize an sklearn-agglomerative ``children_`` merge table as a TSV
	edge list of "child \t parent \t label" rows, ending with a root row.

	In ``children_``, values below ``len(Y)`` are leaves (original samples);
	a value v >= len(Y) refers to the internal node created by merge
	``v - len(Y)``. Merge i itself creates node ``len(Y) + i``. Leaves are
	written with their ``X_labels`` id and gold label from ``Y``; internal
	nodes get an "id_<k>" name and label "None".

	Args:
		children_: sequence of (left, right) merge pairs.
		Y: gold label per leaf.
		X_labels: output identifier per leaf.
		filename: path of the TSV file to write.
	"""
	n_leaves = len(Y)

	def _node(child):
		# Map a children_ entry to (node id string, label string).
		if child < n_leaves:
			leaf = int(child)
			return str(X_labels[leaf]), str(Y[leaf])
		return "id_" + str(int(child)), "None"

	rows = []
	for merge_idx in range(len(children_)):
		parent = "id_" + str(merge_idx + n_leaves)
		for side in (0, 1):
			node_id, node_label = _node(children_[merge_idx][side])
			rows.append("{}\t{}\t{}\n".format(node_id, parent, node_label))

	# The final merge produces the root; it has no parent and no label.
	root = "id_" + str(len(children_) + n_leaves - 1)
	rows.append("{}\tNone\tNone\n".format(root))

	with open(filename, 'w') as fout:
		fout.writelines(rows)
60 |
def calc_dend_purity(linkTree, pidList, y_true):
	"""Compute dendrogram purity of a tree by shelling out to xcluster.

	Writes the tree in xcluster's TSV edge-list format (either verbatim, if
	``linkTree`` is already a formatted string, or via write_tree for an
	sklearn-style merge table), runs bin/util/score_tree.sh on it, parses
	the purity from the first result line, and removes both temp files.

	Requires the XCLUSTER_ROOT environment variable.
	NOTE(review): the literal "24" passed to score_tree.sh is not explained
	by anything visible here — confirm its meaning (thread count?) against
	the xcluster script before changing it.

	Returns:
		dendrogram purity as a float (0 if the result file is empty).
	"""
	dendPurity = 0
	XCLUSTER_ROOT = os.getenv("XCLUSTER_ROOT")
	filenum = time.time()
	treeFileName = "{}/perchTree_{}.tree".format(XCLUSTER_ROOT, filenum)

	# Pick a fresh timestamped filename so concurrent runs don't clobber
	# each other's tree files.
	while os.path.isfile(treeFileName):
		filenum = time.time()
		treeFileName = "{}/perchTree_{}.tree".format(XCLUSTER_ROOT, filenum)


	if isinstance(linkTree, str): # If linkTree is already a formatted string then just write it
		with open(treeFileName, "w") as f:
			f.write(linkTree)
	else:
		write_tree(linkTree, y_true, pidList, treeFileName)


	assert os.path.isfile(treeFileName)

	command = "cd $XCLUSTER_ROOT && source bin/setup.sh && pwd && "
	command += "sh bin/util/score_tree.sh {} algo data 24 None > treeResult_{}".format(treeFileName, filenum)
	os.system(command)

	# score_tree.sh emits "<algo> <data> <dendPurity>"; only the first line
	# is needed.
	resultFileName = "{}/treeResult_{}".format(XCLUSTER_ROOT, filenum)
	with open(resultFileName, "r") as reader:
		for line in reader:
			algo, data, dendPurity = line.split()
			dendPurity = float(dendPurity)
			break

	# Clean up both temp files and verify they are gone.
	command = "rm {} && rm {}".format(treeFileName, resultFileName)
	# print("Removing files:{}".format(command))
	os.system(command)
	assert not os.path.isfile(treeFileName)
	assert not os.path.isfile(resultFileName)
	return dendPurity
98 |
--------------------------------------------------------------------------------
/src/eval/evalMUCF1.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import itertools
17 | from scipy.sparse import csr_matrix
18 | from scipy.sparse.csgraph import connected_components
19 |
def _muc_partial(clusterToPids, otherLabels):
	"""One side of the MUC score: sum over clusters of (size - #components).

	For every cluster with more than one point, builds the graph whose edges
	connect points that the *other* assignment also places together, and
	accumulates (cluster size - number of connected components) over
	(cluster size - 1). Returns (numerator, denominator).
	"""
	numerator, denominator = 0, 0
	for pidList in clusterToPids.values():
		# Singleton clusters contribute nothing to either count.
		if len(pidList) <= 1:
			continue

		data, rows, cols = [], [], []
		for p1, p2 in itertools.combinations(pidList, 2):
			if otherLabels[p1] == otherLabels[p2]:
				# Symmetric adjacency entries for the agreeing pair.
				data += [1, 1]
				rows += [p1, p2]
				cols += [p2, p1]

		# Reindex global pids into 0..len(pidList)-1 for the sparse matrix.
		idMapping = {p: idx for idx, p in enumerate(pidList)}
		rows = [idMapping[p] for p in rows]
		cols = [idMapping[p] for p in cols]
		numPointInCluster = len(pidList)
		adjacency = csr_matrix((data, (rows, cols)), shape=(numPointInCluster, numPointInCluster))

		numConnComp = connected_components(adjacency)[0]
		numerator += numPointInCluster - numConnComp
		denominator += numPointInCluster - 1
	return numerator, denominator


def calc_muc_score(pidToCluster_true, pidToCluster_pred):
	"""MUC precision/recall/F1 for a predicted clustering against gold.

	Args:
		pidToCluster_true: gold cluster id per point (indexable by pid).
		pidToCluster_pred: predicted cluster id per point.

	Returns:
		dict with "muc_precision", "muc_recall", "muc_f1" and the raw
		numerators/denominators, same keys as before.

	Precision and recall were previously computed by two copy-pasted loops;
	both now go through _muc_partial with the roles of gold/pred swapped.
	"""
	# Group point ids by predicted and by gold cluster.
	predClusterToPids = {}
	for idx, predCluster in enumerate(pidToCluster_pred):
		predClusterToPids.setdefault(predCluster, []).append(idx)

	trueClusterToPids = {}
	for idx, trueCluster in enumerate(pidToCluster_true):
		trueClusterToPids.setdefault(trueCluster, []).append(idx)

	precNumerator, precDenominator = _muc_partial(predClusterToPids, pidToCluster_true)
	recallNumerator, recallDenominator = _muc_partial(trueClusterToPids, pidToCluster_pred)

	precision = precNumerator / precDenominator if precDenominator > 0 else 1
	recall = recallNumerator / recallDenominator if recallDenominator > 0 else 1
	f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

	return {"muc_precision": precision, "muc_recall": recall, "muc_f1": f1,
			"muc_precision_num": precNumerator, "muc_precision_den": precDenominator,
			"muc_recall_num": recallNumerator, "muc_recall_den": recallDenominator}
99 |
--------------------------------------------------------------------------------
/src/utils/Config.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import json
17 | import random
18 | import os
19 | import numpy as np
20 | import torch
21 |
class Config(object):
	"""Experiment configuration: defaults, optional JSON override, and the
	derived state (GPU flag, RNG seeds, result directory) that depends on
	those settings.
	"""

	def __init__(self,filename=None):
		"""Initialize defaults, then overlay the JSON file if given.

		After the overlay the GPU flag, random seeds and result directory are
		re-derived, since the file may have changed cuda/seed/resultDir.
		"""

		self.config_name = filename

		# Hardware / reproducibility
		self.cuda = True
		self.useGPU = self.cuda and torch.cuda.is_available()
		self.seed = 1234


		# Run mode and output locations
		self.mode = "train"
		self.resultDir = "auto"  # "auto..." is expanded by updateResultDir
		self.newDirSuffix = ""

		self.clusterFile = ""
		self.dataDir = ""
		self.logFile = "logFile.txt"
		self.bestModel = ""
		self.logConsole = True

		# Training Specific
		self.trainObj = ""
		self.threshold = 0.
		self.margin = 2.
		self.normalizeLoss = False # Normalize loss for training methods other than those starting with "linkage"
		self.normExpLinkLoss = True # Normalize loss for training methods starting with "linkage"
		self.trainExpLink = False
		self.scaleDist = False # Used with VectDataTrainer and ExpLink
		self.numErrorTriplet = 1

		# Epoch schedule and evaluation cadence
		self.numEpoch = 100
		self.numEpToAvg = 10
		self.epochToEval = 1000
		self.epochToWrite = 1000
		self.epsilon = 0.001
		self.makeScorePlots = True
		self.evalBeforeTrain = False
		self.evalOnTrainThresh = False
		self.evalOnTestThresh = False

		# Data split fractions
		self.trainFrac = 0.6
		self.testFrac = 0.3
		self.devFrac = 0.1
		self.shuffleData = True

		# Eval Specific
		self.inferenceMethods = ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t"]
		self.metricsForEval = ["f1", "randIndex", "dendPurity"]

		# Scoring Model Specific Parameters
		self.modelType = ""
		self.inputDim = 1 # Dataset specific
		self.outDisSim = True
		self.lr = 0.01
		self.l2Alpha = 0.01
		self.alphaLr = 0.01
		self.alphaInitMu = 0.
		self.alphaInitSigma = 0.01
		self.trainAlpha = True
		self.trainModel = True
		self.idenInit = False # Useful for Mahalanobis distance learner only



		# Overlay every key from the JSON file onto this object's attributes.
		if filename is not None:
			self.__dict__.update(json.load(open(filename)))

		# REDO Following three steps after updating any important parameter in config object
		self.useGPU = self.cuda and torch.cuda.is_available()
		self.updateRandomSeeds(self.seed)
		self.updateResultDir(self.resultDir)

	def to_json(self):
		"""Return this config as a sorted, indented JSON string (only
		JSON-serializable attributes survive filter_json)."""
		return json.dumps(filter_json(self.__dict__),indent=4,sort_keys=True)

	def save_config(self, exp_dir, filename='config.json'):
		"""Write the JSON form of this config into exp_dir/filename."""
		with open(os.path.join(exp_dir, filename), 'w') as fout:
			fout.write(self.to_json())
			fout.write('\n')

	def __getstate__(self):
		"""Pickle support: drop the (unpicklable) logger if one was attached."""
		state = dict(self.__dict__)
		if "logger" in state:
			del state['logger']

		return state

	def updateResultDir(self, newResultDir):
		"""Set resultDir; "auto<suffix>" expands to a path derived from the
		dataset name, training objective and seed, with <suffix> appended."""

		if newResultDir.startswith("auto"):
			miscInfo = newResultDir[4:]  # whatever follows the "auto" prefix
			dataType = self.dataDir.split("/")[-1]
			self.resultDir = "{base}/d={d}/obj={obj}_s={s}{misc}".format(
				base="../results_refactor",
				d=dataType,
				obj=self.trainObj,
				s=self.seed,
				misc=miscInfo)
		else:
			self.resultDir = newResultDir


	def updateRandomSeeds(self, random_seed):
		"""Seed python, torch, numpy and (if on GPU) CUDA deterministically
		from a single master seed."""

		self.seed = random_seed
		random.seed(random_seed)

		# Derive per-library seeds from the (seeded) python RNG so a single
		# master seed reproduces the whole run.
		self.torch_seed = random.randint(0, 1000)
		self.np_seed = random.randint(0, 1000)
		self.cuda_seed = random.randint(0, 1000)

		torch.manual_seed(self.torch_seed)
		np.random.seed(self.np_seed)
		# NOTE: useGPU already implies torch.cuda.is_available(); the second
		# check is redundant but harmless.
		if self.useGPU and torch.cuda.is_available():
			torch.cuda.manual_seed(self.cuda_seed)
138 |
def filter_json(the_dict):
	"""Return a copy of the_dict keeping only JSON-serializable values.

	str/float/int/list/bool values and None are kept as-is, nested dicts are
	filtered recursively, and everything else (objects, loggers, tensors,
	...) is silently dropped. Exact-type checks are intentional: subclasses
	of the allowed types are not kept.
	"""
	_KEEP = (str, float, int, list, bool)
	filtered = {}
	for key, value in the_dict.items():
		if value is None or type(value) in _KEEP:
			filtered[key] = value
		elif type(value) is dict:
			filtered[key] = filter_json(value)
	return filtered
152 |
--------------------------------------------------------------------------------
/src/models/linearClassifier.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import torch
17 | from torch.autograd import Variable
18 | import numpy as np
19 | from utils.Config import Config
20 |
class LinearClassifier(torch.nn.Module):
	"""A single linear layer that maps a pairwise feature vector to a score.

	Also owns ``linkAlpha``, a learnable scalar (initialised from a Gaussian
	controlled by config.alphaInitMu / config.alphaInitSigma) used by the
	linkage training objectives.
	"""

	def __init__(self, config):
		super(LinearClassifier, self).__init__()
		assert isinstance(config, Config)
		self.config = config
		self.inputDim = config.inputDim  # Dimension of vector for each point
		self.outputDim = 1

		# Single fully-connected layer: pair feature -> scalar score.
		self.seqModel = torch.nn.Sequential(
			torch.nn.Linear(self.inputDim, self.outputDim)
		)

		initVal = np.random.normal(self.config.alphaInitMu, self.config.alphaInitSigma, 1)[0]
		tensorCls = torch.cuda.FloatTensor if self.config.useGPU else torch.FloatTensor
		self.linkAlpha = Variable(tensorCls([initVal]), requires_grad=True)

	def __str__(self):
		parts = [
			"-----------------Linear Classifier Parameters----------------------",
			"linkAlpha:" + str(self.linkAlpha),
			"inputDim::" + str(self.inputDim),
			"output dissimilarity\t" + str(self.config.outDisSim),
			"Layers::" + str(self.seqModel),
		]
		return "\n".join(parts) + "\n" + self.getWeightStr() + "-------------------------------------------------------------------"

	def getWeightStr(self):
		"""Human-readable dump of the layer's weight and bias."""
		return ("Weight::{}".format(self.seqModel[0].weight) + "\n"
				+ "Bias::{}".format(self.seqModel[0].bias) + "\n")

	def _toVariable(self, features):
		# Wrap raw features in a (CUDA if configured) float Variable.
		if self.config.useGPU:
			return Variable(torch.cuda.FloatTensor(features))
		return Variable(torch.Tensor(features))

	def pairForward(self, pairFeature):
		"""Score a single pairwise feature vector."""
		return self.seqModel(self._toVariable(pairFeature))

	def pairBatchForward(self, pairFeatureList):
		"""Score a batch of pairwise feature vectors."""
		return self.seqModel(self._toVariable(pairFeatureList))
77 |
class AvgLinearClassifier(LinearClassifier):
	"""LinearClassifier that also keeps an averaged-perceptron style running
	average of its weights in ``avgWeights``."""

	def __init__(self, config):
		super(AvgLinearClassifier, self).__init__(config)
		hasBias = self.seqModel[0].bias is not None
		self.updateNum = 0  # number of averaging updates folded in so far
		self.avgWeights = torch.nn.Linear(self.inputDim, self.outputDim, bias=hasBias)

	def __str__(self):
		parts = [
			"-----------------Average Linear Classifier Parameters-----------------------------",
			"linkAlpha::\t" + str(self.linkAlpha),
			"inputDim::\t" + str(self.inputDim),
			"output dissimilarity\t" + str(self.config.outDisSim),
			"updateNum" + str(self.updateNum),
			"Layers::" + str(self.seqModel),
		]
		return "\n".join(parts) + "\n" + self.getWeightStr() + "-------------------------------------------------------------------"

	def getWeightStr(self):
		"""Dump current and averaged weight/bias values."""
		parts = [
			"Weight::{}".format(self.seqModel[0].weight),
			"Bias::{}".format(self.seqModel[0].bias),
			"Avg Weight::{}".format(self.avgWeights.weight.data),
			"Avg Bias::{}".format(self.avgWeights.bias.data),
		]
		return "\n".join(parts) + "\n"

	# Average weights after making gradient update
	def updateAvgWeights(self):
		"""Fold the current weights into the running average:
		new_avg = (updateNum * old_avg + current) / (updateNum + 1)."""
		self.avgWeights.weight.data = self.updateNum * self.avgWeights.weight.data + self.seqModel[0].weight.data
		if self.avgWeights.bias is not None:
			self.avgWeights.bias.data = self.updateNum * self.avgWeights.bias.data + self.seqModel[0].bias.data

		self.updateNum += 1
		self.avgWeights.weight.data = self.avgWeights.weight.data / self.updateNum
		if self.avgWeights.bias is not None:
			self.avgWeights.bias.data = self.avgWeights.bias.data / self.updateNum

	def pairAvgBatchForward(self, pairFeatureList):
		"""Score a batch of pair features with the averaged weights."""
		if self.config.useGPU:
			wrapped = Variable(torch.cuda.FloatTensor(pairFeatureList))
		else:
			wrapped = Variable(torch.Tensor(pairFeatureList))
		return self.avgWeights(wrapped)

	def pairAvgForward(self, pairFeature):
		"""Score a single pair feature with the averaged weights."""
		if self.config.useGPU:
			wrapped = Variable(torch.cuda.FloatTensor(pairFeature))
		else:
			wrapped = Variable(torch.Tensor(pairFeature))
		return self.avgWeights(wrapped)
137 |
--------------------------------------------------------------------------------
/src/hier_clust/recursive_sparsest_cut.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 |
17 | """Run hierarchical sparsest cut."""
18 | import argparse
19 | import datetime
20 | import numpy as np
21 | import uuid
22 | import os
23 | import sys
24 |
25 | from itertools import combinations
26 |
27 | from sklearn.cluster import SpectralClustering
28 |
def log_exp_minus_dist(x, y):
    """Distance helper over numpy vectors x and y.

    NOTE(review): the commented torch reference below returns the *negative*
    L2 distance (log of exp(-dist)), but this numpy version returns
    norm(-(x - y)) == +||x - y|| — negating inside the norm has no effect.
    Confirm the intended sign before relying on this helper.
    """
    # return -((x - y).norm(2, 1)).unsqueeze(1)
    return np.linalg.norm(-(x - y))
32 |
33 |
def log_1_by_1p_dist(x, y):
    """Return log(1 / (1 + sqrt(||x - y||))) for numpy vectors x and y.

    Bug fix: the previous body computed np.linalg.norm(np.sqrt(x - y)) —
    the norm of the element-wise sqrt of the difference — which is NaN
    whenever any component of x is smaller than the matching component of y.
    The commented torch reference (-log1p(sqrt(norm(x - y)))) shows the
    intended formula: sqrt of the norm, not norm of the sqrt.
    """
    # return - torch.log1p(np.sqrt((x - y).norm(2, 1))).unsqueeze(1)
    return - np.log1p(np.sqrt(np.linalg.norm(x - y)))
37 |
def sparsest_cut(sims):
    """Split a precomputed similarity matrix into two index sets.

    Two nodes admit only one cut; otherwise 2-way spectral clustering on the
    precomputed affinities decides the partition.

    Args:
        sims: square similarity matrix (len() gives the node count).

    Returns:
        (left, right): index collections for the two sides — plain lists in
        the 2-node case, integer numpy arrays otherwise.
    """
    if len(sims) == 2:
        return [0], [1]
    spectral = SpectralClustering(n_clusters=2, n_jobs=-1, affinity='precomputed')
    labels = spectral.fit_predict(sims)
    # Fix: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin int is the documented replacement and is what np.int aliased.
    left = np.where(labels == 0)[0].astype(int)
    right = np.where(labels == 1)[0].astype(int)
    return left, right
53 |
def run(sim_file, label_file, out_file):
    """Build a hierarchy by recursive sparsest cut and write it as a TSV tree.

    Args:
        sim_file: .npy file holding a square similarity matrix.
        label_file: .npy file holding one gold label per matrix row.
        out_file: path of the "child \t parent \t label" tree file to write.

    Raises:
        Exception: if a cut is degenerate (all nodes on one side). The
            original code had unreachable "manual split" statements after
            these raises; they have been removed as dead code.
    """
    sims = np.load(sim_file)
    labels = np.load(label_file)

    # Frontier entries: (node id, parent id, label, similarity sub-matrix,
    # original-observation indices covered by the node).
    output = ''
    frontier = [(uuid.uuid4(), 'None', 'None', sims, np.arange(labels.shape[0]))]
    while frontier:
        nid, pid, label, mat, obs = frontier.pop(0)
        output += '%s\t%s\t%s\n' % (nid, pid, label)
        if obs.shape[0] > 1:
            l, r = sparsest_cut(mat)
            # A degenerate cut cannot be recursed on; abort loudly.
            if np.size(l) == 0 or np.size(r) == 0:
                raise Exception('bad case...')

            # A side with one observation becomes a labelled leaf; a larger
            # side becomes an internal node with a fresh uuid.
            if np.size(l) > 1:
                l_nid, l_label = uuid.uuid4(), 'None'
            else:
                l_nid, l_label = obs[l[0]], labels[obs[l[0]]]

            if np.size(r) > 1:
                r_nid, r_label = uuid.uuid4(), 'None'
            else:
                r_nid, r_label = obs[r[0]], labels[obs[r[0]]]

            l_obs = np.array([obs[i] for i in l])
            r_obs = np.array([obs[i] for i in r])
            frontier.append((l_nid, nid, l_label, mat[l, :][:, l], l_obs))
            frontier.append((r_nid, nid, r_label, mat[r, :][:, r], r_obs))

    # Bug fix: the original used os.path.basename(out_file) here, creating a
    # directory named after the file itself instead of its parent directory.
    out_dir = os.path.dirname(out_file)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    with open(out_file, 'w') as fout:
        fout.write(output)
115 |
116 |
def run_sparsest_cut(sims, labels ):
    """In-memory variant of run(): recursively split ``sims`` by sparsest cut
    and return the tree as a "child \t parent \t label" TSV string.

    Args:
        sims: square similarity matrix (numpy array).
        labels: gold label per row of ``sims`` (numpy array).

    Returns:
        The TSV edge-list string (leaves carry their observation index and
        gold label; internal nodes get uuids and label "None").

    NOTE(review): after each ``raise Exception('bad case...')`` below there
    are "manual split" statements that are unreachable — the raise always
    fires first. ``num_done`` is also never incremented. Both oddities are
    shared with run(); left untouched here.
    """

    # sims = np.load(sim_file)
    from scipy.spatial.distance import cdist
    # sims = cdist(transformedPointList,transformedPointList)
    # labels = np.array([pidToGtCluster[i] for i in range(len(pidToGtCluster))])

    # (Node id, parent id, label, mat, objs)
    output = ''
    frontier = [(uuid.uuid4(), 'None', 'None', sims, np.arange(labels.shape[0]))]
    num_done = 0
    while frontier:
        # print("Splits on frontier: {}. Completed {}".format(len(frontier), num_done))
        nid, pid, label, mat, obs = frontier.pop(0)
        output += '%s\t%s\t%s\n' % (nid, pid, label)
        if obs.shape[0] > 1:
            l, r = sparsest_cut(mat)
            # Sometimes, this sparsest cut will not split the nodes. If this is
            # the case, we need to manually split them.
            if np.size(l) == 0:
                raise Exception('bad case...')
                # Unreachable (see NOTE in docstring):
                l = [0]
                r = list(range(1, len(obs)))
            if np.size(r) == 0:
                raise Exception('bad case...')
                # Unreachable (see NOTE in docstring):
                r = [0]
                l = list(range(1, len(obs)))

            # Single-observation sides become labelled leaves; larger sides
            # become internal nodes with fresh uuids.
            if np.size(l) > 1:
                l_nid = uuid.uuid4()
                l_label = 'None'
            else:
                assert (np.size(l) == 1)
                l_nid = obs[l[0]]
                l_label = labels[obs[l[0]]]

            if np.size(r) > 1:
                r_nid = uuid.uuid4()
                r_label = 'None'
            else:
                assert (np.size(r) == 1)
                r_nid = obs[r[0]]
                r_label = labels[obs[r[0]]]

            # print(obs)
            l_obs = np.array([obs[i] for i in l])
            # print(l_obs)
            r_obs = np.array([obs[i] for i in r])
            # print(r_obs)
            frontier.append((l_nid, nid, l_label, mat[l, :][:, l], l_obs))
            frontier.append((r_nid, nid, r_label, mat[r, :][:, r], r_obs))
            # print(num_done)

    return output
171 |
172 |
if __name__ == '__main__':
    # Usage: python recursive_sparsest_cut.py <sim_file.npy> <label_file.npy> <out_file>
    run(sys.argv[1], sys.argv[2], sys.argv[3])
175 |
--------------------------------------------------------------------------------
/src/utils/create_synth_dataset.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import csv, os, argparse, itertools, math
17 | from pathlib import Path
18 |
19 | from utils.basic_utils import read_canopy_data, read_clusters, read_clusters_synth
20 | from utils.plotting import plot_clusters, plot_clusters_w_edges
21 |
def rotate(point, theta, anchor=(0, 0)):
	"""Rotate a 2-D point by ``theta`` radians counter-clockwise around
	``anchor`` and return the result as an (x, y) tuple."""
	ax, ay = anchor
	# Translate so the anchor is the origin, rotate, translate back.
	dx, dy = point[0] - ax, point[1] - ay
	cos_t, sin_t = math.cos(theta), math.sin(theta)
	return (cos_t * dx - sin_t * dy + ax, sin_t * dx + cos_t * dy + ay)
27 |
# Reads clusters from file=filename and creates pairwise features for these clusters and stores them in dataDir
def create_pairwise_data(filename, dataDir, squared=False):
	"""Turn a synthetic cluster file into a pairwise-feature dataset.

	Writes two files into ``dataDir`` (created if needed):
	  gtClusters.tsv   — space-separated "<pid> <cluster id>" per point;
	  pairFeatures.csv — one row per point pair: pid1, pid2, the
	                     component-wise |x1-x2| feature vector (squared if
	                     ``squared``), and a 1/0 same-cluster label.
	"""
	clusters = read_clusters_synth(filename)

	# Flatten clusters into pid -> (point, cluster id), pids assigned in order.
	pointData = {}
	pid = 0
	for cid, points in clusters.items():
		for point in points:
			pointData[pid] = (point, cid)
			pid += 1

	Path(dataDir).mkdir(exist_ok=True, parents=True)
	with open("{}/gtClusters.tsv".format(dataDir),"w") as f:
		gtWriter = csv.writer(f, delimiter=" ")
		for pid, (_, cid) in pointData.items():
			gtWriter.writerow([pid, cid])

	with open("{}/pairFeatures.csv".format(dataDir),"w") as f:
		pairWriter = csv.writer(f)
		for pid1, pid2 in itertools.combinations(pointData.keys(), 2):
			point1, cid1 = pointData[pid1]
			point2, cid2 = pointData[pid2]

			featureVec = [abs(a - b) for a, b in zip(point1, point2)]
			if squared:
				featureVec = [x * x for x in featureVec]

			sameCluster = 1 if cid1 == cid2 else 0
			pairWriter.writerow([pid1, pid2] + featureVec + [sameCluster])
61 |
# Reads clusters from file=filename and creates pairwise features for these clusters and stores them in dataDir
# This one is specially written for generating different datasets for spiral clusters
def create_pairwise_spiral(filename, dataDir, squared=False, theta=0., trimAt=None, pushMinValTo=None):
	"""Build a pairwise dataset from two spiral clusters, with optional
	rotation and feature clipping.

	Cluster 2 is dropped entirely; of clusters 0 and 1 only every 5th point
	of the slice [40:90] is kept, and cluster 1 is rotated by ``theta``
	around (16, 15). Writes origData_<theta>.png, orig2D.txt,
	pidToPoint.txt, gtClusters.tsv and pairFeatures.csv into ``dataDir``.

	Args:
		filename: cluster file readable by read_clusters_synth.
		dataDir: output directory (created if needed).
		squared: square each feature component.
		theta: rotation (radians) applied to cluster 1.
		trimAt: if set, cap the (swapped-to-front) larger feature at this value.
		pushMinValTo: if set, raise the larger component of near-zero pairs
			so that featureVec sums below this floor are pushed up to it.
	"""
	clusters = read_clusters_synth(filename)

	pointData = {} # Maps each pointId to (point,cid) tuple
	pointToCluster = {} # Maps each point to its cluster
	pntCtr,pid = 0,0
	for cid in clusters:
		if cid == 2: continue  # drop the third spiral entirely
		# np.random.shuffle(clusters[cid])
		for point in clusters[cid][40:90]:
			pntCtr += 1
			if pntCtr % 5 != 0: continue  # keep every 5th point only
			if cid == 1:
				# Rotate the second spiral about (16, 15) to vary the dataset.
				newPoint = rotate(point, theta, (16, 15))
			else:
				newPoint = point

			pointData[pid] = (newPoint, cid)
			pointToCluster[newPoint] = cid
			pid += 1

	Path(dataDir).mkdir(exist_ok=True, parents=True)
	plot_clusters(pointToCluster, dataDir + "/origData_{:.2f}.png".format(theta))
	# Raw 2-D coordinates with their cluster ids.
	with open(dataDir+"/orig2D.txt","w") as writer:
		for point in pointToCluster:
			writer.write("{}\t{}\t{}\n".format(point[0],point[1],pointToCluster[point]))

	# pid -> (x, y) mapping.
	with open(dataDir+"/pidToPoint.txt","w") as writer:
		for pid in pointData:
			point = pointData[pid]
			writer.write("{}\t{}\t{}\n".format(pid,point[0][0],point[0][1]))

	with open("{}/gtClusters.tsv".format(dataDir), "w") as f:
		csvWriter = csv.writer(f, delimiter=" ")
		for pid in pointData.keys():
			row = [pid, pointData[pid][1]]
			csvWriter.writerow(row)

	with open("{}/pairFeatures.csv".format(dataDir), "w") as f:
		csvWriter = csv.writer(f)
		for pid1, pid2 in itertools.combinations(pointData.keys(), 2):

			featureVec = [abs(x1 - x2) for x1, x2 in zip(pointData[pid1][0], pointData[pid2][0])]
			if squared:
				featureVec = [x ** 2 for x in featureVec]

			# Move the over-limit component to the front, then cap it.
			if trimAt is not None and featureVec[0] > trimAt:
				featureVec[0], featureVec[1] = featureVec[1], featureVec[0]
				featureVec[0] = min(trimAt, featureVec[0])

			# Push near-identical pairs apart by raising the larger component.
			if pushMinValTo is not None and featureVec[0] + featureVec[1] < pushMinValTo:
				if featureVec[0] < featureVec[1]:
					featureVec[1] = pushMinValTo
				else:
					featureVec[0] = pushMinValTo

			row = [pid1, pid2] + featureVec
			if pointData[pid1][1] == pointData[pid2][1]:
				row.append(1)
			else:
				row.append(0)

			csvWriter.writerow(row)
127 |
if __name__ == "__main__":

    # Command to generate a spiral dataset with some rotation, with 2 spirals where
    # MST and allPairs objectives differ significantly:
    #   python scripts/create_synth_dataset.py --file=../data/sprial.txt --outDir=../data/spiralSmallRotated

    parser = argparse.ArgumentParser(description='Create dataset with edges = |p1-p2| from points in Rd')

    parser.add_argument('--file', type=str, required=True, help='File containing points in Rd')
    parser.add_argument('--outDir', type=str, required=True, help='Directory for newly created dataset')
    parser.add_argument('--sq', action="store_true", default=False, help='Square each component of edge?')

    args = parser.parse_args()

    filename = args.file  # filename = "../data/sprial.txt"
    dataDir = args.outDir  # dataDir = "../data/spiral_pw_sqd"

    # Generate one dataset per rotation angle; datasets are written into <outDir>/1
    # (a "canopy" subdirectory expected by read_canopy_data).
    # for theta in np.arange(0,3.14,0.1):
    for theta in [0.8]:
        create_pairwise_spiral(filename=filename,dataDir=dataDir+"/1",squared=args.sq,theta=theta)
        canopy = read_canopy_data(dataDir)
        plot_clusters_w_edges(canopy=canopy, model=None, filename=dataDir + "/1/edgeData_{:.2f}.png".format(theta))

    # Alternative invocations kept for reference:
    # clusters = readClusters_synth(filename)
    # points = {}
    # for cid in clusters:
    #     for point in clusters[cid]:
    #         points[point] = cid
    # plotClusters(points, dataDir+"/1/origData.png")

    # canopy = readCanopyData(dataDir)
    # plotClustersEdges(canopy=canopy, model=None, filename=dataDir+"/1/edgeData")

    # dataDir = "../data/spiral_pw/1"
    # create_pairwise_spiral(filename, dataDir, False)

    # dataDir = "../data/spiral_pw_sqd/1"
    # create_pairwise_spiral(filename, dataDir, True)

    # dataDir = "../data/spiral_pw_sqd_trimmed/1"
    # create_pairwise_spiral(filename, dataDir, True, 20)

    # dataDir = "../data/spiral_pw_sqd_trimmed_larger/1"
    # create_pairwise_spiral(filename, dataDir, True, 20,8)
--------------------------------------------------------------------------------
/src/models/mahalabonis.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import torch
17 | from torch.autograd import Variable
18 | import numpy as np
19 |
20 |
class MahalanobisDist(torch.nn.Module):
    """Learnable Mahalanobis-style distance: d(p1, p2) = ||W p1 - W p2||_2.

    A single bias-free square linear layer W (inputDim x inputDim) embeds points;
    distances are Euclidean in the embedded space, equivalent to a Mahalanobis
    distance with matrix M = W^T W.
    """

    def __init__(self, config):
        """
        Args:
            config: Config object. Fields read here: inputDim (point dimension),
                idenInit (if True, initialize W to the identity matrix), useGPU.
        """
        super(MahalanobisDist, self).__init__()
        self.config = config
        self.inputDim = config.inputDim
        self.outputDim = self.inputDim  # W is square, so output dim equals input dim

        self.seqModel = torch.nn.Sequential(
            torch.nn.Linear(self.inputDim, self.outputDim, bias=False)
        )

        if config.idenInit:  # Initialize with Identity Matrix
            # Temporarily freeze the weight so the raw .data overwrite is not tracked
            self.seqModel[0].weight.requires_grad = False
            self.seqModel[0].weight.data = torch.eye(self.config.inputDim)
            self.seqModel[0].weight.requires_grad = True

    def __str__(self):
        printStr = ""
        printStr += "-----------------Mahalanobis Distance Learner Parameters-----------------------------" + "\n"
        printStr += "inputDim::\t" + str(self.inputDim) + "\n"
        printStr += "Layers::" + str(self.seqModel) + "\n"
        printStr += "Parameters::" + str(list(self.parameters())) + "\n"
        printStr += "-------------------------------------------------------------------"
        return printStr

    def getWeightStr(self):
        # Human-readable dump of the learned transform W
        weightStr = "Weight::{}".format(self.seqModel[0].weight)
        return weightStr

    # Returns numpy array after transforming every point according to Mahalanobis distance matrix
    def transformPoints(self, pointList):
        """Embed every point through W and return the result as a numpy array.

        Args:
            pointList: sequence of points (anything torch.Tensor accepts, e.g. list of lists).
        Returns:
            numpy array of shape (numPoints, outputDim).
        """
        if self.config.useGPU:
            pointList = torch.cuda.FloatTensor(pointList)
        else:
            pointList = torch.Tensor(pointList)
        transformedPointList = self.seqModel(pointList)
        if self.config.useGPU:
            transformedPointList = transformedPointList.cpu().data.numpy()
        else:
            transformedPointList = transformedPointList.data.numpy()

        return transformedPointList

    def pairForward(self, pairFeature):
        """Distance from a precomputed pair feature (the difference of two points).

        Returns a 1-element tensor: ||W * pairFeature||_2.
        """
        if self.config.useGPU:
            pairFeature = Variable(
                torch.cuda.FloatTensor(pairFeature))  # take difference of two vectors to send as input
        else:
            pairFeature = Variable(torch.Tensor(pairFeature))  # take difference of two vectors to send as input

        prediction = torch.norm(self.seqModel(pairFeature), p=2).view(1)
        return prediction

    def pairBatchForward(self, pairFeatureList):
        """Batched version of pairForward; returns a (listLen, 1) tensor of distances."""
        listLen = len(pairFeatureList)
        if self.config.useGPU:
            pairFeatureList = Variable(
                torch.cuda.FloatTensor(pairFeatureList))  # take difference of two vectors to send as input
        else:
            pairFeatureList = Variable(torch.Tensor(pairFeatureList))  # take difference of two vectors to send as input

        prediction = torch.norm(self.seqModel(pairFeatureList), dim=1, p=2).view(listLen, 1)
        assert prediction.shape == torch.Size([listLen, 1])
        return prediction

    def forward(self, point1, point2):
        """Return ||W p1 - W p2||_2 as a 0-dim tensor (differentiable)."""
        if self.config.useGPU:
            p1 = torch.cuda.FloatTensor(point1)
            p2 = torch.cuda.FloatTensor(point2)
        else:
            p1 = torch.Tensor(point1)
            p2 = torch.Tensor(point2)

        embed1 = self.seqModel(p1)
        embed2 = self.seqModel(p2)
        distance = torch.norm(embed1 - embed2, p=2)
        return distance

    # This function does not return a pytorch Variable.
    # Just the Mahalabonis distance between point1 and point2
    def forwardPlain(self, point1, point2):
        """Like forward(), but returns a plain numpy scalar instead of a tensor."""
        distance = self.forward(point1, point2)
        if self.config.useGPU:
            distance = distance.cpu().data.numpy()
        else:
            distance = distance.data.numpy()
        return distance

    # Takes list of points and returns an adjacency matrix for it of size n x n
    def batchForwardWithin(self, points):
        """Pairwise distance matrix among `points`; entry (i, j) = ||W p_i - W p_j||_2.

        Returns an (n, n) tensor.
        """
        numPoints = len(points)
        if self.config.useGPU:
            pointTensor = torch.cuda.FloatTensor(points)
        else:
            pointTensor = torch.Tensor(points)

        # BUGFIX(perf): embed the points once instead of converting the same list
        # to a tensor twice and running seqModel twice on identical data.
        embed = self.seqModel(pointTensor)
        embedList1 = embed.view(numPoints, 1, self.outputDim)
        embedList2 = embed.view(1, numPoints, self.outputDim)

        # Use broadcasting feature to get nXn matrix where (i,j) contains ||p_i - p_j||_2
        distMatrix = torch.norm(embedList1 - embedList2, p=2, dim=2).view(numPoints, numPoints)

        return distMatrix

    # Takes list of 2 points and returns an adjacency matrix for them of size n1 x n2
    def batchForwardAcross(self, pointList1, pointList2):
        """Cross distance matrix; entry (i, j) = distance between pointList1[i] and pointList2[j].

        Returns an (n1, n2) tensor.
        """
        numPoint1 = len(pointList1)
        numPoint2 = len(pointList2)
        if self.config.useGPU:
            pointList1 = torch.cuda.FloatTensor(pointList1)
            pointList2 = torch.cuda.FloatTensor(pointList2)
        else:
            pointList1 = torch.Tensor(pointList1)
            pointList2 = torch.Tensor(pointList2)

        embedList1 = self.seqModel(pointList1).view(numPoint1, 1, self.outputDim)
        embedList2 = self.seqModel(pointList2).view(1, numPoint2, self.outputDim)

        # Use broadcasting feature to get nXn matrix where (i,j) contains ||p_i - p_j||_2
        distMatrix = torch.norm(embedList1 - embedList2, p=2, dim=2).view(numPoint1, numPoint2)
        return distMatrix

    # Returns distance between corresponding points in list 1 and list 2
    def batchForwardOneToOne(self, pointList1, pointList2):
        """Element-wise distances: entry (i, 0) = distance(pointList1[i], pointList2[i]).

        Both lists must have the same length; returns an (n, 1) tensor.
        """
        assert (len(pointList1) == len(pointList2))
        numPoints = len(pointList1)
        if self.config.useGPU:
            pointList1 = torch.cuda.FloatTensor(pointList1).view(numPoints, self.inputDim)
            pointList2 = torch.cuda.FloatTensor(pointList2).view(numPoints, self.inputDim)
        else:
            pointList1 = torch.Tensor(pointList1).view(numPoints, self.inputDim)
            pointList2 = torch.Tensor(pointList2).view(numPoints, self.inputDim)

        embedList1 = self.seqModel(pointList1)
        embedList2 = self.seqModel(pointList2)

        distMatrix = (torch.norm(embedList1 - embedList2, p=2, dim=1)).view(numPoints, 1)
        return distMatrix
class GenLinkMahalanobis(MahalanobisDist):
    """Mahalanobis distance model augmented with a learnable linkage parameter alpha.

    linkAlpha is kept as a free leaf tensor (requires_grad=True) rather than an
    nn.Parameter, so it does not appear in self.parameters(); the trainer is
    expected to hand it to the optimizer explicitly.
    """

    def __init__(self, config):
        """
        Args:
            config: Config object; additionally reads alphaInitMu / alphaInitSigma
                (mean / std-dev for drawing the initial alpha) and useGPU.
        """
        super(GenLinkMahalanobis, self).__init__(config)

        # Draw the initial alpha value from N(alphaInitMu, alphaInitSigma)
        initAlpha = np.random.normal(self.config.alphaInitMu, self.config.alphaInitSigma, 1)[0]
        tensorType = torch.cuda.FloatTensor if self.config.useGPU else torch.FloatTensor
        self.linkAlpha = Variable(tensorType([initAlpha]), requires_grad=True)

    def __str__(self):
        parts = [
            "-----------------General Linkage with Mahalanobis Distance Matrix: Parameters-----------------------------",
            "linkAlpha::\t" + str(self.linkAlpha),
            "inputDim::\t" + str(self.inputDim),
            "Layers::" + str(self.seqModel),
            "Parameters::" + str(list(self.parameters())),
            "-------------------------------------------------------------------\n",
        ]
        return "\n".join(parts)
--------------------------------------------------------------------------------
/src/trainer/train_vect_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import argparse, time, sys, os
17 | from pathlib import Path
18 | import torch
19 |
20 | from models.mahalabonis import MahalanobisDist, GenLinkMahalanobis
21 |
22 | from utils.Config import Config
23 | from utils.basic_utils import create_logger
24 | from eval.finalEval import run_final_eval
25 |
26 | from trainer.VectDataTrainer import VectDataTrainer
27 |
def trainExpLinkOnly(trainer):
    """Train only the exp-link (linkage) alpha parameter on top of a trained model.

    Applies only when trainer.config.modelType == "maha": the learned Mahalanobis
    transform is copied into a GenLinkMahalanobis model, the transform itself is
    frozen (trainModel=False, trainAlpha=True), and training is re-run with the
    "linkage_auto" objective. Result files from the original training run are
    backed up into <resultDir>/origTraining the first time this runs.

    Args:
        trainer: VectDataTrainer; mutated in place (model swap, config changes).
            The alpha-trained model is saved to <resultDir>/model_alpha.torch.
    """
    if trainer.config.trainObj == "linkage_auto":
        trainer.logger.info("Not training linkageAlpha separately because if trainObj is linakge_auto then it must be trained already...")
    elif trainer.config.modelType == "maha":

        assert isinstance(trainer.model, MahalanobisDist)

        # Copy the learned transform into a model that also carries the linkage alpha.
        # requires_grad is toggled off so the raw .data overwrite is not tracked.
        new_model = GenLinkMahalanobis(trainer.config)
        new_model.seqModel[0].weight.requires_grad = False
        new_model.seqModel[0].weight.data = trainer.model.seqModel[0].weight.data
        new_model.seqModel[0].weight.requires_grad = True
        trainer.model = new_model
        if trainer.config.useGPU:
            trainer.logger.info("Shifting model to cuda because GPUs are available!")
            trainer.model = trainer.model.cuda()

        # Freeze the distance matrix; only alpha is optimized from here on
        trainer.config.trainAlpha = True
        trainer.config.trainModel = False
        trainer.resetOptimizer()

        if "linkage_auto" not in trainer.config.inferenceMethods:
            trainer.config.inferenceMethods += ["linkage_auto"]
        if "linkage_auto@t" not in trainer.config.inferenceMethods:
            trainer.config.inferenceMethods += ["linkage_auto@t"]

        # Back up result files from the original training run (only once)
        origCSVFile = "{}/origTraining/results.csv"
        fileCheck = Path(origCSVFile.format(trainer.config.resultDir))
        if not fileCheck.is_file():
            # BUGFIX: format the template with resultDir instead of printing the raw "{}" template
            print("File does not exist:{}".format(origCSVFile.format(trainer.config.resultDir)))
            command = "cd {} && mkdir -p origTraining && cp *.csv origTraining/ && cp *.png origTraining/".format(trainer.config.resultDir)
            os.system(command)

        trainer.config.trainObj = "linkage_auto"
        trainer.logger.info("Training alpha parameter of expLink ...\n\n\n")
        trainer.logger.info(trainer.model)

        t1 = time.time()
        success = trainer.train()
        if success is not None and (not success):
            # Alpha training failed: drop the linkage_auto inference methods again.
            # BUGFIX: catch only ValueError (raised by list.remove when the entry
            # is absent) instead of a bare except that hid every error.
            try:
                trainer.config.inferenceMethods.remove("linkage_auto@t")
                trainer.config.inferenceMethods.remove("linkage_auto")
            except ValueError:
                pass

        trainer.printModelWeights()
        trainer.logger.info("Training alpha parameter of expLink linkage ends...in time={:.3f}".format(time.time() - t1))
        trainer.logger.info("Saving model...")

        trainer.config.bestModel = os.path.join(trainer.config.resultDir, "model_alpha.torch")
        torch.save(trainer.model, trainer.config.bestModel)
        trainer.config.save_config(trainer.config.resultDir, "config_expLink.json")
        trainer.logger.info("Saved model...")

    else:
        trainer.logger.info("Not training linkageAlpha separately because if modelType is not Mahalanobis distance matrix... ")
86 |
def runMain(config):
    """Build a VectDataTrainer per config.mode, run it, then run the final evaluation.

    Modes:
        "train":        train a fresh model, save it, optionally train expLink alpha.
        "trainExpLink": load a saved model and train only the expLink alpha parameter.
        "test":         load a saved model and only run the final evaluation.

    Args:
        config: Config instance; mode, resultDir, trainExpLink etc. are read and
            some fields (bestModel, resultDir) are updated in place.

    Raises:
        Exception: if config.mode is not one of train / test / trainExpLink.

    NOTE(review): the "trainExpLink" and "test" branches read the module-level
    `args` namespace (args.newDirSuffix), so this function is only usable when
    invoked through this script's __main__ block.
    """
    command = sys.argv
    start = time.time()
    assert isinstance(config,Config)
    if config.mode == "train":
        trainer = VectDataTrainer(config)
        trainer.printModelWeights()

        t1 = time.time()
        trainer.train()

        trainer.logger.info("Training ends...in time={:.3f}".format(time.time() - t1))
        trainer.printModelWeights()
        trainer.logger.info("Saving model...")

        trainer.config.bestModel = os.path.join(trainer.config.resultDir, "model.torch")
        torch.save(trainer.model, trainer.config.bestModel )
        trainer.config.save_config(trainer.config.resultDir)
        trainer.logger.info("Saved model...")

        ################### Train alpha parameter for softLink ##########################
        if config.trainExpLink:
            trainExpLinkOnly(trainer)
        #################################################################################

    elif config.mode == "trainExpLink":
        trainer = VectDataTrainer(config)

        # Load model and reset optimizer to have parameters of the loaded model
        trainer.loadModel()

        # Update output directory
        trainer.config.resultDir = trainer.config.resultDir + args.newDirSuffix
        Path(trainer.config.resultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present

        # Update logger object
        trainer.logger = create_logger(config=config, logFile="logFile_trainExpLink.txt", currLogger=trainer.logger)

        trainer.logger.info(trainer)
        trainer.logger.info(command)
        trainExpLinkOnly(trainer)

    elif config.mode == "test":
        trainer = VectDataTrainer(config)

        # Load model and reset optimizer to have parameters of the loaded model
        trainer.loadModel()

        # Update output directory
        trainer.config.resultDir = trainer.config.resultDir + args.newDirSuffix
        Path(trainer.config.resultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present

        # Update logger object
        trainer.logger = create_logger(config=config, logFile="logFile_retest.txt", currLogger=trainer.logger)

        trainer.logger.info(command)
        trainer.logger.info(trainer)

    else:
        # BUGFIX(consistency): include trainExpLink, which is a valid mode handled
        # above (matches the equivalent message in train_pair_feat.py)
        raise Exception("Invalid mode = {}. Choose one from: test, train or trainExpLink".format(config.mode))

    # Final evaluation runs for every mode
    run_final_eval(trainer)
    # trainer.performFinalEvaluation()
    trainer.logger.info("\n\n\n\n")

    trainer.logger.info(trainer)
    trainer.logger.info(command)
    end = time.time()
    trainer.logger.info(" Total time taken = {:.4f} = {:.4f} min = {:.4f} hours".format(end - start, (end - start)/60, (end - start)/3600))
158 |
if __name__ == '__main__':
    parser = argparse.ArgumentParser( description='Supervised clustering training for data in R^n')
    parser.add_argument('--config', type=str, help="Config file")

    ################################## OPTIONAL ARGUMENTS TO OVERWRITE CONFIG FILE ARGS###################################################
    # Every field of a default Config becomes an optional CLI flag; flags default to
    # None so that values not given on the command line keep the config-file value.
    temp_config = Config()
    for config_arg in temp_config.__dict__:
        def_val = temp_config.__getattribute__(config_arg)
        arg_type = type(def_val) if def_val is not None else str
        parser.add_argument('--{}'.format(config_arg), type=arg_type, default=None, help='If not specified then value from config file will be used')
    #########################################################################################################

    args = parser.parse_args()

    assert args.config is not None
    config = Config(args.config)
    # Overwrite config-file values with any flags that were explicitly passed
    for config_arg in temp_config.__dict__:
        def_val = getattr(args, config_arg)
        if def_val is not None:
            old_val = config.__dict__[config_arg]
            config.__dict__.update({config_arg:def_val})
            new_val =config.__dict__[config_arg]
            print("Updating Config.{} from {} to {} using arg_val={}".format(config_arg, old_val, new_val, def_val))

    # Update result directory if there are any parameters passed through command line that are different from those in config file
    if args.resultDir is None:
        config.updateResultDir("auto")
    else:
        config.updateResultDir(args.resultDir)

    Path(config.resultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present
    config.useGPU = config.cuda and torch.cuda.is_available()  # GPU use requires both the flag and available hardware
    config.updateRandomSeeds(config.seed)
    config.save_config(config.resultDir, "orig_config.json")
    runMain(config)
194 |
195 |
196 |
197 |
198 |
--------------------------------------------------------------------------------
/src/trainer/train_pair_feat.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 |
17 | import argparse, time, sys, os
18 | from pathlib import Path
19 | import torch
20 |
21 |
22 | from utils.Config import Config
23 | from utils.basic_utils import create_logger
24 | from eval.finalEval import run_final_eval
25 |
26 | from models.linearClassifier import LinearClassifier
27 | from trainer.PairFeatureTrainer import PairFeatureTrainer
28 |
def trainExpLinkOnly(trainer):
    """Train only the exp-link (linkage) alpha parameter on top of a trained model.

    Applies only when modelType is "linear" or "avgLinear": the learned linear
    weights (the running average for "avgLinear") are copied into a fresh
    LinearClassifier, the weights are frozen (trainModel=False, trainAlpha=True),
    and training is re-run with the "linkage_auto" objective. Result files from
    the original training run are backed up into <resultDir>/origTraining the
    first time this runs.

    Args:
        trainer: PairFeatureTrainer; mutated in place (model swap, config changes).
            The alpha-trained model is saved to <resultDir>/model_alpha.torch.
    """
    assert isinstance(trainer, PairFeatureTrainer)

    if trainer.config.trainObj == "linkage_auto":
        trainer.logger.info("Not training linkageAlpha separately because if trainObj is linakge_auto then it must be trained already...")
    elif (trainer.config.modelType == "avgLinear" or trainer.config.modelType == "linear"):

        if trainer.config.modelType == "avgLinear":
            # Use the averaged weights (avgWeights) accumulated during training
            newModel = LinearClassifier(trainer.config)
            newModel.seqModel[0].weight.data = trainer.model.avgWeights.weight.data
            if trainer.model.seqModel[0].bias is not None:
                newModel.seqModel[0].bias.data = trainer.model.avgWeights.bias.data

            trainer.model = newModel
        elif trainer.config.modelType == "linear":
            newModel = LinearClassifier(trainer.config)
            newModel.seqModel[0].weight.data = trainer.model.seqModel[0].weight.data
            if trainer.model.seqModel[0].bias is not None:
                newModel.seqModel[0].bias.data = trainer.model.seqModel[0].bias.data

            trainer.model = newModel
        else:
            raise Exception("Invalid modelType..{}".format(trainer.config.modelType))

        if trainer.config.useGPU:
            trainer.logger.info("Shifting model to cuda because GPUs are available!")
            trainer.model = trainer.model.cuda()

        # Freeze the linear weights; only alpha is optimized from here on
        trainer.config.trainAlpha = True
        trainer.config.trainModel = False
        trainer.resetOptimizer()

        if "linkage_auto" not in trainer.config.inferenceMethods:
            trainer.config.inferenceMethods += ["linkage_auto"]
        if "linkage_auto@t" not in trainer.config.inferenceMethods:
            trainer.config.inferenceMethods += ["linkage_auto@t"]

        # Back up result files from the original training run (only once)
        origCSVFile = "{}/origTraining/results.csv"
        fileCheck = Path(origCSVFile.format(trainer.config.resultDir))
        if not fileCheck.is_file():
            # BUGFIX: format the template with resultDir instead of printing the raw "{}" template
            print("File does not exist:{}".format(origCSVFile.format(trainer.config.resultDir)))
            command = "cd {} && mkdir -p origTraining && cp *.csv origTraining/ && cp *.png origTraining/".format(trainer.config.resultDir)
            os.system(command)

        trainer.config.trainObj = "linkage_auto"
        trainer.logger.info("Training alpha parameter of expLink ...\n\n\n")
        trainer.logger.info(trainer.model)

        trainT1 = time.time()
        success = trainer.train()
        if success is not None and (not success):
            # Alpha training failed: drop the linkage_auto inference methods again.
            # BUGFIX: catch only ValueError (raised by list.remove when the entry
            # is absent) instead of a bare except that hid every error.
            try:
                trainer.config.inferenceMethods.remove("linkage_auto@t")
                trainer.config.inferenceMethods.remove("linkage_auto")
            except ValueError:
                pass

        trainer.printModelWeights()

        trainer.config.bestModel = os.path.join(trainer.config.resultDir, "model_alpha.torch")
        torch.save(trainer.model, trainer.config.bestModel)
        trainer.config.save_config(trainer.config.resultDir, "config_expLink.json")

        trainT2 = time.time()
        trainer.logger.info("Training alpha parameter of expLink linkage ends in time={:.3f} = {:.3f} min = {:.3f} hr \n\n\n".format(trainT2 - trainT1,(trainT2 - trainT1)/60, (trainT2 - trainT1)/3600))
    else:
        trainer.logger.info("Not training linkageAlpha separately because if modelType is not linear or avgLinear... ")
97 |
def runMain(config):
    """Build a PairFeatureTrainer per config.mode, run it, then run the final evaluation.

    Modes:
        "train":        train a fresh model, save it, optionally train expLink alpha.
        "trainExpLink": load a saved model and train only the expLink alpha parameter.
        "test":         load a saved model and only run the final evaluation.

    Args:
        config: Config instance; mode, resultDir, trainExpLink etc. are read and
            some fields (bestModel, resultDir) are updated in place.

    Raises:
        Exception: if config.mode is not one of train / test / trainExpLink.

    NOTE(review): the "trainExpLink" and "test" branches read the module-level
    `args` namespace (args.newDirSuffix), so this function is only usable when
    invoked through this script's __main__ block.
    """
    assert isinstance(config,Config)

    command = sys.argv
    start = time.time()

    if config.mode == "train":
        trainer = PairFeatureTrainer(config)
        trainer.logger.info(command)

        trainer.logger.info("Inital Weights of the model...")
        trainer.printModelWeights()


        trainT1 = time.time()
        trainer.train()
        trainT2 = time.time()
        trainer.logger.info("Training ends in time={:.3f} = {:.3f} min = {:.3f} hr Saving model".format(trainT2 - trainT1,(trainT2 - trainT1)/60,(trainT2 - trainT1)/3600))

        trainer.logger.info("Weights that the model converged to...")
        trainer.printModelWeights()

        trainer.config.bestModel = os.path.join(trainer.config.resultDir, "model.torch")
        torch.save(trainer.model, trainer.config.bestModel )
        trainer.config.save_config(trainer.config.resultDir)
        trainer.logger.info("Saved model...")

        # Optionally train the expLink alpha parameter on top of the trained model
        if config.trainExpLink:
            trainExpLinkOnly(trainer)

    elif config.mode == "trainExpLink":
        trainer = PairFeatureTrainer(config)
        trainer.logger.info(command)

        # Load model and reset optimizer to have parameters of the loaded model
        trainer.loadModel()

        # Update output directory
        trainer.config.resultDir = trainer.config.resultDir + args.newDirSuffix
        Path(trainer.config.resultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present

        # Update logger object
        trainer.logger = create_logger(config=config, logFile="logFile_trainExpLink.txt", currLogger=trainer.logger)

        trainer.logger.info(trainer)
        trainExpLinkOnly(trainer)

    elif config.mode == "test":
        trainer = PairFeatureTrainer(config)
        trainer.logger.info(command)

        # Load model and reset optimizer to have parameters of the loaded model
        trainer.loadModel()

        # Update output directory
        trainer.config.resultDir = trainer.config.resultDir + args.newDirSuffix
        Path(trainer.config.resultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present

        # Update logger object
        trainer.logger = create_logger(config=config, logFile="logFile_retest.txt", currLogger=trainer.logger)

    else:
        raise Exception("Invalid mode = {}. Choose one from: test, train or trainExpLink".format(config.mode))

    # Final evaluation runs for every mode
    t1 = time.time()
    run_final_eval(trainer)
    t2 = time.time()
    trainer.logger.info(" Total time taken for final evaluation = {:.4f} = {:.4f} min = {:.4f} hours".format(t2 - t1, (t2 - t1)/60, (t2 - t1)/3600))

    trainer.logger.info(trainer)
    trainer.logger.info(command)
    end = time.time()
    trainer.logger.info(" Total time taken = {:.4f} = {:.4f} min = {:.4f} hours".format(end - start, (end - start)/60, (end - start)/3600))
172 |
if __name__ == '__main__':
    parser = argparse.ArgumentParser( description='Supervised clustering training with features given on every pair of points')

    temp_config = Config()
    parser.add_argument('--config', type=str, help="Config file")
    ################################## OPTIONAL ARGUMENTS TO OVERWRITE CONFIG FILE ARGS###################################################
    # Every field of a default Config becomes an optional CLI flag; flags default to
    # None so that values not given on the command line keep the config-file value.
    for config_arg in temp_config.__dict__:
        def_val = temp_config.__getattribute__(config_arg)
        arg_type = type(def_val) if def_val is not None else str
        parser.add_argument('--{}'.format(config_arg), type=arg_type, default=None, help='If not specified then value from config file will be used')
    #########################################################################################################

    args = parser.parse_args()

    assert args.config is not None
    config = Config(args.config)
    # Overwrite config-file values with any flags that were explicitly passed
    for config_arg in temp_config.__dict__:
        def_val = getattr(args, config_arg)
        if def_val is not None:

            old_val = config.__dict__[config_arg]
            config.__dict__.update({config_arg:def_val})
            new_val =config.__dict__[config_arg]
            print("Updating Config.{} from {} to {} using arg_val={}".format(config_arg, old_val, new_val, def_val))

    # Update result directory if there are any parameters passed through command line that are different from those in config file
    if args.resultDir is None:
        config.updateResultDir("auto")
    else:
        config.updateResultDir(args.resultDir)

    Path(config.resultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present
    config.useGPU = config.cuda and torch.cuda.is_available()  # GPU use requires both the flag and available hardware
    config.updateRandomSeeds(config.seed)
    config.save_config(config.resultDir, "orig_config.json")

    runMain(config)
210 |
--------------------------------------------------------------------------------
/src/trainer/scipy_perceptron.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 |
17 | import numpy as np,argparse
18 | import torch
19 | from pathlib import Path
20 |
21 | from sklearn.linear_model import Perceptron
22 | from sklearn.linear_model import SGDClassifier
23 | from sklearn.svm import SVC
24 |
25 | from eval.evalPairFeat import get_conn_comp_pair_feat
26 | from utils.Config import Config
27 | from utils.plotting import plot_clusters_w_edges, plot_clusters
28 | from eval.threshold import choose_threshold
29 |
30 | from models.linearClassifier import LinearClassifier
31 | from models.templateClassifier import Classifier
32 | from PairFeatureTrainer import PairFeatureTrainer
33 |
def getBestClassifier(modelType,seed,X,Y):
    """Fit the requested classifier 10 times on (X, Y) and return the one with the
    best training accuracy.

    Args:
        modelType: one of "SVMLinear", "SVMRbf", "Perceptron", "AvgPerceptron",
            "MST". For "MST" the coefficients are hard-coded to the optimal
            parameters learnt by the MST objective instead of being fit.
        seed: seed for np.random (and the Perceptron's random_state in the MST branch).
        X: training feature matrix.
        Y: training labels.

    Returns:
        The fitted sklearn classifier with the highest training accuracy.

    Raises:
        Exception: for an unrecognized modelType.
    """
    classifiers = {}
    np.random.seed(seed)
    for i in range(10):
        if modelType == "SVMLinear":
            clf = SGDClassifier(loss="hinge", penalty="l2", tol=1e-9, alpha=0.01, max_iter=1000)  # Linear SVM
        elif modelType == "SVMRbf":
            clf = SVC(gamma='auto', tol=1e-9, )
        elif modelType == "Perceptron":
            clf = SGDClassifier(loss="perceptron", penalty="l2", tol=1e-9, alpha=0.01, max_iter=1000)
        elif modelType == "AvgPerceptron":
            clf = SGDClassifier(loss="perceptron", penalty="l2", tol=1e-9, alpha=0.01, max_iter=1000,average=True)
        elif modelType == "MST":
            # BUGFIX: use the `seed` parameter instead of the module-level global
            # `config`, which only exists when this file is run as a script.
            clf = Perceptron(random_state=seed, penalty="l2", max_iter=1000, alpha=0.01, tol=1e-5, warm_start=True,shuffle=True)
            clf.fit(X, Y)  # Doing this just to get the other class variables initialized
            # Optimal parameters as learnt by MST objective
            clf.coef_ = np.array([[-0.092749, -0.076006]])
            clf.intercept_ = np.array([0.3871])
        else:
            raise Exception("Invalid Model:{}".format(modelType))

        # Initializing parameters. Need to set warm_start to True for this purpose. If shuffle is False then we get
        # same results for every random_state but if shuffle is True then we get different parameters because data is shuffled
        # at every iteration
        # clf = Perceptron(random_state=args.seed, penalty="l2", max_iter=1000, alpha=0.01, tol=1e-5, warm_start=True,shuffle=True)
        # clf.coef_ = np.array([[1,1]])
        # clf.intercept_ = np.array([0])
        # clf.fit(X, Y)
        # Optimal parameters as learnt by MST objective
        # clf.coef_ = np.array([[-0.092749, -0.076006]])
        # clf.intercept_ = np.array([0.3871])

        # For "MST" the hand-set parameters must be kept; re-fitting would overwrite them
        if modelType != "MST":
            clf.fit(X, Y)
        score = clf.score(X, Y)
        print("Accuracy on train data:{:.3f}".format(score))
        classifiers[i] = (clf,score)

    # Pick the run with the highest training accuracy
    bestClf = None
    bestScore = 0
    for i in classifiers.keys():
        if bestClf is None or bestScore < classifiers[i][1]:
            bestClf = classifiers[i][0]
            bestScore = classifiers[i][1]

    print("Model with best Accuracy on train data:{:.3f}".format(bestScore))
    return bestClf
81 |
if __name__ == "__main__":

	# Script entry point: trains a pairwise classifier on synthetic 2-D point data and, for the
	# "spiral" dataset, plots the learnt decision boundary and the resulting clusterings.
	parser = argparse.ArgumentParser("Run Scipy perceptron on pairwise data(synthetic points in R2)")
	parser.add_argument('--config', type=str, help="Config file")

	temp_config = Config()
	################################## OPTIONAL ARGUMENTS TO OVERWRITE CONFIG FILE ARGS###################################################
	# Every attribute of Config is exposed as an optional command-line flag; a flag left at its
	# default (None) means "keep the value from the config file"
	for config_arg in temp_config.__dict__:
		def_val = temp_config.__getattribute__(config_arg)
		arg_type = type(def_val) if def_val is not None else str
		parser.add_argument('--{}'.format(config_arg), type=arg_type, default=None, help='If not specified then value from config file will be used')
	#########################################################################################################

	args = parser.parse_args()

	assert args.config is not None
	config = Config(args.config)
	# Overwrite config-file values with any explicitly passed command-line arguments
	for config_arg in temp_config.__dict__:
		def_val = getattr(args, config_arg)
		if def_val is not None:

			old_val = config.__dict__[config_arg]
			config.__dict__.update({config_arg:def_val})
			new_val =config.__dict__[config_arg]
			print("Updating Config.{} from {} to {} using arg_val={}".format(config_arg, old_val, new_val, def_val))

	# Update result directory if there are any parameters passed through command line that are different from those in config file
	if args.resultDir is None:
		config.updateResultDir("auto")
	else:
		config.updateResultDir(args.resultDir)

	Path(config.resultDir).mkdir(parents=True, exist_ok=True) # Create resultDir directory if not already present
	config.useGPU = config.cuda and torch.cuda.is_available()
	config.updateRandomSeeds(config.seed)
	config.save_config(config.resultDir, "orig_config.json")

	trainer = PairFeatureTrainer(config)
	resultDir = trainer.config.resultDir

	# Flatten all train canopies into one dataset: X holds pair-feature vectors, Y holds 1 for
	# within-cluster pairs and 0 for across-cluster pairs
	X,Y = [],[]
	for canopyId in trainer.trainCanopies:
		canopy = trainer.trainCanopies[canopyId]
		for (p1,p2) in canopy["pairFeatures"]:
			X.append(canopy["pairFeatures"][(p1,p2)])
			label = 1 if canopy["pidToCluster"][p1] == canopy["pidToCluster"][p2] else 0
			Y.append(label)

	X, Y = np.array(X), np.array(Y)
	clf = getBestClassifier(config.model, config.seed, X, Y)

	# Plotting below needs 2-D point coordinates, so it only runs for the synthetic "spiral" dataset
	if "spiral" in trainer.config.dataDir:
		# Load the mapping from point id to its (x1, x2) coordinates for plotting
		pidToPoint = {}
		with open("{}/1/pidToPoint.txt".format(trainer.config.dataDir)) as f:
			for line in f:
				lineV = line.strip().split()
				pid, x1, x2 = int(lineV[0]), float(lineV[1]), float(lineV[2])
				pidToPoint[pid] = (x1, x2)


		if hasattr(clf,"coef_"):
			# Linear model: copy the learnt parameters (m1, m2, b) into trainer.model so that
			# choose_threshold and the connected-components inference below use the same weights
			b = clf.intercept_[0]
			m1,m2 = clf.coef_[0][0],clf.coef_[0][1]

			assert isinstance(trainer.model, LinearClassifier)
			trainer.model.seqModel[0].weight.data = torch.cuda.FloatTensor([[m1, m2]]) if config.useGPU else torch.FloatTensor([[m1, m2]])
			trainer.model.seqModel[0].bias.data = torch.cuda.FloatTensor([b]) if config.useGPU else torch.FloatTensor([b])
			optThresh = choose_threshold(trainer,"connComp", "1", trainer.trainCanopies)

			# optModel shifts the bias by the chosen threshold so its zero level-set is the tuned boundary
			model = (m1, m2, b)
			optModel = (m1, m2, b - optThresh)
			plot_clusters_w_edges(trainer.trainCanopies, model, "{}/boundary_{}.pdf".format(resultDir, config.seed))
			plot_clusters_w_edges(trainer.trainCanopies, optModel, "{}/boundaryOpt_{}.pdf".format(resultDir, config.seed))
			# plotClustersEdges(trainer.trainCanopies, optModel, "{}/boundaryOptWithBase_{}.pdf".format(resultDir, config.seed), baseModel=model)
			plot_clusters_w_edges(trainer.trainCanopies, model, "{}/boundaryOptWithBase_{}.pdf".format(resultDir, config.seed), baseModel=optModel)
		elif isinstance(clf,SVC):
			# Kernel SVM has no linear coefficients: wrap the sklearn classifier so the trainer can score pairs with it
			trainer.model = Classifier(config)
			trainer.model.clf = clf
			optThresh = choose_threshold(trainer,"connComp", "1", trainer.trainCanopies)
			print("Opt threshold = {}".format(optThresh))

			plot_clusters_w_edges(trainer.trainCanopies, clf, "{}/boundary_{}.png".format(resultDir, config.seed))
		else:
			raise Exception("Invalid model:{}",clf)

		# Cluster every train canopy via connected components, once with the tuned threshold and
		# once with threshold 0, and plot predicted vs true clusterings
		for canopyId in trainer.trainCanopies:
			canopy = trainer.trainCanopies[canopyId]
			pidToPredCluster = get_conn_comp_pair_feat(model=trainer.model, pairFeatures=canopy["pairFeatures"],
													   pidToCluster=canopy["pidToCluster"], threshold=optThresh)
			pointToPredCluster = {}
			pointToTrueCluster = {}
			for pid in pidToPredCluster:
				point = pidToPoint[pid]
				pointToPredCluster[point] = pidToPredCluster[pid]
				pointToTrueCluster[point] = canopy["pidToCluster"][pid]


			plot_clusters(pointToCluster=pointToPredCluster, filename=trainer.config.resultDir + "/predClusterOptThresh_{}.pdf".format(config.seed))
			plot_clusters(pointToCluster=pointToTrueCluster, filename=trainer.config.resultDir + "/trueCluster.pdf")

			pidToPredCluster = get_conn_comp_pair_feat(model=trainer.model, pairFeatures=canopy["pairFeatures"],
													   pidToCluster=canopy["pidToCluster"], threshold=0)
			pointToPredCluster = {}
			pointToTrueCluster = {}
			for pid in pidToPredCluster:
				point = pidToPoint[pid]
				pointToPredCluster[point] = pidToPredCluster[pid]
				pointToTrueCluster[point] = canopy["pidToCluster"][pid]
			plot_clusters(pointToCluster=pointToPredCluster, filename=trainer.config.resultDir + "/predClusterLearnt.pdf".format(config.seed))
--------------------------------------------------------------------------------
/src/eval/finalEval.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 |
17 | from eval.evalPairFeat import eval_model_pair_feat_per_canopy
18 | from models.mahalabonis import MahalanobisDist
19 | from models.linearClassifier import AvgLinearClassifier, LinearClassifier
20 | from eval.threshold import choose_threshold
21 | from utils.plotting import write_scores_comb,write_scores_separate, plot_scores_per_canopy, plot_scores
22 |
23 | # Perform final evaluation of model
def run_final_eval(trainer):
	"""Run end-of-training evaluation of trainer's model on train/test/dev data.

	For each configured inference method a threshold is tuned on the dev canopies
	(and, when enabled, also on the test and train canopies), and the model is
	evaluated on all three splits with each set of thresholds.
	"""
	trainer.logger.info("Choosing best threshold for evaluation in the end...")
	if isinstance(trainer.model, AvgLinearClassifier):
		trainer.logger.info("Loading average weights")
		trainer.model.seqModel[0].weight.data = trainer.model.avgWeights.weight.data
		if trainer.model.seqModel[0].bias is not None:
			trainer.model.seqModel[0].bias.data = trainer.model.avgWeights.bias.data

	trainer.logger.info("Weights being used for performing evaluation...")
	trainer.printModelWeights()

	trainer.config.threshold = None  # Reset so that choose_threshold actually searches for a threshold

	# Thresholds tuned on the dev canopies
	devThreshDict = {method: choose_threshold(trainer, infMethod=method, epoch="END_BestDev")
					 for method in trainer.config.inferenceMethods}
	trainer.logger.info("Using dev thresholdVals:{}".format(devThreshDict))
	eval_all_data(trainer, devThreshDict, "/BestDevThresh")

	# Thresholds tuned on the test canopies (only when there are test canopies and it is enabled)
	if len(trainer.testCanopies) > 0 and trainer.config.evalOnTestThresh:
		testThreshDict = {method: choose_threshold(trainer, infMethod=method, epoch="END_BestTest", canopies=trainer.testCanopies)
						  for method in trainer.config.inferenceMethods}
		trainer.logger.info("Using test thresholdVals:{}".format(testThreshDict))
		eval_all_data(trainer, testThreshDict, "/BestTestThresh")

	# Thresholds tuned on the train canopies (only when enabled)
	if trainer.config.evalOnTrainThresh:
		trainThreshDict = {method: choose_threshold(trainer, infMethod=method, epoch="END_BestTrain", canopies=trainer.trainCanopies)
						   for method in trainer.config.inferenceMethods}
		trainer.logger.info("Using train thresholdVals:{}".format(trainThreshDict))
		eval_all_data(trainer, trainThreshDict, "/BestTrainThresh")
68 |
def eval_all_data(trainer, threshDict, relResultDir = None):
	"""Evaluate trainer's model on the test, dev and train canopies with the given thresholds.

	:param trainer: Trainer object providing evalFunc, model, config and the three canopy splits
	:param threshDict: Dict mapping inference method name -> threshold to use for it
	:param relResultDir: If not None, scores are also plotted/written under config.resultDir + relResultDir
	:return: Tuple of (train scores, test scores, dev scores)
	"""
	# Evaluate with the inference methods present in threshDict rather than
	# config.inferenceMethods, because during training we sometimes evaluate on just one method
	infMethods = list(threshDict.keys())

	allScores = {"train": {}, "test": {}, "dev": {}}
	for splitName, canopies in (("test", trainer.testCanopies),
								("dev", trainer.devCanopies),
								("train", trainer.trainCanopies)):
		allScores[splitName][0] = trainer.evalFunc(config=trainer.config, model=trainer.model, canopies=canopies,
												   threshDict=threshDict, inferenceMethods=infMethods,
												   metricsForEval=trainer.config.metricsForEval)

	if relResultDir is not None:
		outDir = trainer.config.resultDir + relResultDir
		if trainer.config.makeScorePlots:
			plot_scores(allLosses={"train": {}, "test": {}, "dev": {}}, allScores=allScores,
						currResultDir=outDir, xlabel="Threshold")

		write_scores_comb(allLosses={"train": {}, "test": {}, "dev": {}}, allScores=allScores,
						  currResultDir=outDir, xlabel="Threshold")

		write_scores_separate(allLosses={"train": {}, "test": {}, "dev": {}}, allScores=allScores,
							  currResultDir=outDir, xlabel="Threshold")

	return allScores["train"][0], allScores["test"][0], allScores["dev"][0]
96 |
def run_final_eval_per_canopy(trainer):
	"""Run end-of-training evaluation, reporting scores separately for every canopy.

	Thresholds are tuned per inference method on the dev canopies, on the test canopies
	(when any exist) and on the train canopies, and per-canopy scores for each setting
	are written/plotted by eval_all_data_per_canopy.
	"""
	from trainer.PairFeatureTrainer import PairFeatureTrainer
	assert isinstance(trainer, PairFeatureTrainer)

	trainer.logger.info("Choosing optimal threshold and running model with average weights for that...")

	# For averaged models, evaluate with the averaged weights instead of the final ones
	if isinstance(trainer.model, AvgLinearClassifier):
		trainer.model.seqModel[0].weight.data = trainer.model.avgWeights.weight.data
		if trainer.model.seqModel[0].bias is not None:
			trainer.model.seqModel[0].bias.data = trainer.model.avgWeights.bias.data

	if isinstance(trainer.model, MahalanobisDist) or isinstance(trainer.model, LinearClassifier):
		trainer.logger.info("Weights being used for performing evaluation...")
		trainer.logger.info("Weight::{}".format(trainer.model.seqModel[0].weight))
		trainer.logger.info("Bias::{}".format(trainer.model.seqModel[0].bias))

	trainer.logger.info("Choosing best threshold for evaluation in the end...")

	trainer.config.threshold = None  # Reset so that choose_threshold actually searches for a threshold

	############################### Choose threshold based on dev canopy######################################
	threshDict = {}
	for method in trainer.config.inferenceMethods:
		threshDict[method] = choose_threshold(trainer, infMethod=method, epoch="END_BestDev")

	trainer.logger.info("Using dev thresholdVals:{}".format(threshDict))
	eval_all_data_per_canopy(trainer, threshDict, "/BestDevThresh")
	###########################################################################################################

	############################### Choose threshold based on test canopy######################################
	if len(trainer.testCanopies) > 0:
		threshDict = {}
		for method in trainer.config.inferenceMethods:
			threshDict[method] = choose_threshold(trainer, infMethod=method, epoch="END_BestTest",
												  canopies=trainer.testCanopies)

		trainer.logger.info("Using test thresholdVals:{}".format(threshDict))
		eval_all_data_per_canopy(trainer, threshDict, "/BestTestThresh")
	###########################################################################################################

	############################### Choose threshold based on train canopy#####################################
	threshDict = {}
	for method in trainer.config.inferenceMethods:
		threshDict[method] = choose_threshold(trainer, infMethod=method, epoch="END_BestTrain",
											  canopies=trainer.trainCanopies)

	trainer.logger.info("Using train thresholdVals:{}".format(threshDict))
	eval_all_data_per_canopy(trainer, threshDict, "/BestTrainThresh")
	###########################################################################################################
149 |
def eval_all_data_per_canopy(trainer, threshDict, relResultDir):
	"""Evaluate the model separately on every canopy of the test, dev and train splits and plot the scores.

	:param trainer: Trainer providing model, logger, config and the three canopy splits
	:param threshDict: Dict mapping inference method name -> threshold to use for it
	:param relResultDir: Subdirectory (relative to config.resultDir) where score plots are written
	"""
	allScores = {}
	for splitName, canopies in (("test", trainer.testCanopies),
								("dev", trainer.devCanopies),
								("train", trainer.trainCanopies)):
		allScores[splitName] = eval_model_pair_feat_per_canopy(model=trainer.model, canopies=canopies,
															   logger=trainer.logger, threshDict=threshDict,
															   inferenceMethods=trainer.config.inferenceMethods,
															   metricsForEval=trainer.config.metricsForEval)

	plot_scores_per_canopy(allScores=allScores, currResultDir=trainer.config.resultDir + relResultDir)
165 |
--------------------------------------------------------------------------------
/src/utils/combineResults.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import csv,os,argparse,copy
17 | import numpy as np
18 | from pathlib import Path
19 |
20 | from utils.plotting import plotMetricsFromCSV
21 | from utils.basic_utils import get_filename_list
22 | from utils.Config import Config
23 |
def combineResults(parameters, xlabel, currResultDir, template):
	"""
	Put together results from all files into 1 file
	:param parameters: Dictionary with key as parameter names and value as a list of parameter values which need to be combined
	:param xlabel: Varying dimension
	:param currResultDir: Directory where the combined results.csv and avgOfBestResults.csv are written
	:param template: template for folder name where results are read from for combining
	:return:
	"""

	filenameList = get_filename_list(parameters, template)
	data = {}
	header = None
	numFiles = 0  # Number of files that actually existed and were read
	# Read data from all files; data[filenum] maps the file's xlabel value -> row dict
	for filenum, filename in enumerate(filenameList):
		data[filenum] = {}

		if not Path(filename).is_file():
			print("pwd:{}".format(os.getcwd()))
			print("File does not exist:{}".format(filename))
			continue

		numFiles += 1
		with open(filename, "r") as f:  # Read data from this file into a dictionary
			csvReader = csv.DictReader(f)
			for row in csvReader:
				if header is None: header = list(row.keys())  # Get header names from the first readable file

				for col in header:  # Convert all row values to float if they can be, else assign 0 (these columns must be empty)
					try:
						row[col] = float(row[col])
					except ValueError:  # ValueError because these col must be empty
						assert row[col] == ""
						row[col] = 0

				xlabelValue = row[xlabel] if xlabel in row else None
				assert xlabelValue is not None

				data[filenum][xlabelValue] = {col: row[col] for col in header if col != xlabel}

		# Each per-run results.csv is expected to contain exactly one row
		assert len(data[filenum]) == 1

	if header is None:
		# No readable file at all; nothing to combine (previously this crashed on header + ["FileNum"])
		print("No readable result files found for template:{}".format(template))
		return

	# Compute best result for each file
	finalData = {}
	for filenum in data:
		if len(data[filenum].keys()) == 0:
			print("Ignoring file:{}\n".format(filenum))
			continue

		assert len(data[filenum]) == 1
		bestxDimValue = list(data[filenum].keys())[0]
		finalData[filenum] = (bestxDimValue, copy.deepcopy(data[filenum][bestxDimValue]))

	# Write csv file containing best results from all files.
	# NOTE: iterate over every position in filenameList (not range(numFiles)) — numFiles only
	# counts files that existed, so indexing by it used to silently drop later files whenever
	# an earlier file in the list was missing.
	with open(currResultDir + "/results.csv", "w") as f:
		csvWriter = csv.DictWriter(f, fieldnames=header + ["FileNum"])
		csvWriter.writeheader()

		for filenum in range(len(filenameList)):
			if filenum in finalData:
				tempDict = copy.deepcopy(finalData[filenum][1])
				tempDict[xlabel] = finalData[filenum][0]  # Add xDim to dictionary when writing data
				tempDict["FileNum"] = filenum

				csvWriter.writerow(tempDict)

	print("\nIgnoring orginal standard deviations when computing avg of best results")
	print("File will have standard deviation of best mean scores\n")
	# Write csv file containing avg of best results from all files
	with open(currResultDir + "/avgOfBestResults.csv", "w") as f:
		csvWriter = csv.DictWriter(f, fieldnames=header)
		csvWriter.writeheader()

		avgData = {col: [] for col in header}

		for col in header:
			if col.endswith("_std"):  # Original std columns are ignored; std of best means is recomputed below
				continue

			for filenum in range(len(filenameList)):
				if filenum in finalData:
					if col == xlabel:
						avgData[col] += [finalData[filenum][0]]
					else:
						avgData[col] += [finalData[filenum][1][col]]

			if col.endswith("_mean"):  # Computing std deviation of best scores
				avgData[col[:-5] + "_std"] = np.std(avgData[col])
			avgData[col] = np.mean(avgData[col])

		for col in avgData:
			if isinstance(avgData[col], float):
				avgData[col] = "{:0.4f}".format(avgData[col])

		csvWriter.writerow(avgData)
141 |
def run_combineResults(baseResDir, outDirPrefix, xlabel, baseTemplate, relResultDir, parameters):
	"""
	Combine results.csv files from several runs into one directory under baseResDir.

	:param baseResDir: Directory where all result folders are stored
	:param outDirPrefix: Prefix to be used for directory where results will be stored
	:param xlabel: Dimension along which best rows have to be found. eg=Threshold, Epoch
	:param baseTemplate: Template structure for folder where results are stored
	:param relResultDir: Folder where result.csv file is present,(relative to result directory where training results are stored)
	:param parameters: Dictionary with key as parameter names and value as a list of parameter values which need to be combined
	:return:
	"""
	origWorkDir = os.getcwd()
	os.chdir(baseResDir)
	try:
		currResultDir = "{outDirPrefix}_xlabel={xlabel}/{base}".format(outDirPrefix=outDirPrefix, xlabel=xlabel, base=baseTemplate)
		currResultDir = currResultDir.format(**parameters)
		Path(currResultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present
		print("CurrResultDir:{}".format(currResultDir))

		############### Combine Results ############################
		template = baseTemplate + "/{}/results.csv".format(relResultDir)
		combineResults(parameters, xlabel, currResultDir, template)
	finally:
		# Restore the caller's working directory. Previously origWorkDir was saved but never
		# used, leaving the process chdir'd into baseResDir as a lingering side effect.
		os.chdir(origWorkDir)

	################ Plot Results ##############################
	# currResultDir = baseResDir + "/" + currResultDir
	# plotMetricsFromCSV(currResultDir=currResultDir, xlabel="FileNum")
169 |
170 |
if __name__ == "__main__":

	# Script entry point: reads a training config plus command-line options, then combines the
	# per-seed results.csv files produced by separate runs into one summary directory.
	parser = argparse.ArgumentParser(description='Combine results from different runs Ex: python -m scripts.combineResults --outDirPrefix=BestF1_AvgW --baseResDir=../results/c=NP_Coref --relResultDir=varyThresAvgWeights_f1 --xlabel=Threshold --trainObj=allWithin_allAcross --threshold=0.0 --margin=5 --modelType=avgLinear --trainFrac=0.6 --testFrac=0.3 --devFrac=0.1 --seed 1 2 3 4 5 6 7 8 9 10')

	# ################################## OPTIONAL ARGUMENTS TO OVERWRITE CONFIG FILE ARGS###################################################
	# temp_config = Config()
	# for config_arg in temp_config.__dict__:
	# 	if config_arg == "seed": continue
	# 	def_val = temp_config.__getattribute__(config_arg)
	# 	arg_type = type(def_val) if def_val is not None else str
	# 	parser.add_argument('--{}'.format(config_arg), type=arg_type, default=None, help='If not specified then value from config file will be used')
	# #########################################################################################################

	parser.add_argument('--config', type=str,required=True, help='Config file')
	parser.add_argument('--seed', nargs='+',required=True, type=int, help="seed for random number generator")
	parser.add_argument('--xlabel', type=str,required=True, help='X-Label')
	parser.add_argument('--baseResDir', type=str, required=True,help='Directory where all result folders are stored')
	parser.add_argument('--suffix', type=str, default="", help="Suffix at end of each directory")
	parser.add_argument('--relResultDir', type=str,required=True, help='Name of folder where results.csv file is present(relative to folder where training results are stored')
	parser.add_argument('--outDirPrefix', type=str,required=True, help='Prefix to be used for directory where results will be stored')

	args = parser.parse_args()
	config = Config(args.config)

	# Template parameters: "d" = dataset name, "obj" = training objective, "s" = seeds.
	# NOTE(review): "s" is a list while "d"/"obj" are scalars — get_filename_list presumably
	# expands list-valued parameters into one filename per value; confirm against its definition.
	parameters = {}
	parameters["d"] = config.dataDir.split("/")[-1]
	parameters["obj"] = config.trainObj
	parameters["s"] = args.seed
	xlabel = args.xlabel

	# Include the optional directory suffix in the folder-name template only when one was given
	if args.suffix != "":
		parameters["suff"] = [args.suffix]
		baseTemplate = "obj={obj}_s={s}{suff}"
	else:
		baseTemplate = "obj={obj}_s={s}"

	run_combineResults(baseResDir=args.baseResDir, outDirPrefix=args.outDirPrefix, xlabel=args.xlabel,
					   baseTemplate=baseTemplate, relResultDir=args.relResultDir, parameters=parameters)
210 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/env.yml:
--------------------------------------------------------------------------------
1 | name: base
2 | channels:
3 | - pytorch
4 | - lenskit
5 | - defaults
6 | dependencies:
7 | - _ipyw_jlab_nb_ext_conf=0.1.0=py36he11e457_0
8 | - _libgcc_mutex=0.1=main
9 | - _pytorch_select=0.2=gpu_0
10 | - _tflow_select=2.1.0=gpu
11 | - alabaster=0.7.10=py36h306e16b_0
12 | - anaconda=custom=py36hbbc8b67_0
13 | - anaconda-client=1.6.9=py36_0
14 | - anaconda-navigator=1.7.0=py36_0
15 | - anaconda-project=0.8.2=py36h44fb852_0
16 | - arrow-cpp=0.11.1=py36h5c3f529_1
17 | - asn1crypto=0.24.0=py36_0
18 | - astor=0.7.1=py36_0
19 | - astroid=1.6.1=py36_0
20 | - astropy=2.0.3=py36h14c3975_0
21 | - attrs=17.4.0=py36_0
22 | - babel=2.5.3=py36_0
23 | - backports=1.0=py36hfa02d7e_1
24 | - backports.shutil_get_terminal_size=1.0.0=py36hfea85ff_2
25 | - beautifulsoup4=4.6.0=py36h49b8c8c_1
26 | - bitarray=0.8.1=py36h14c3975_1
27 | - bkcharts=0.2=py36h735825a_0
28 | - blas=1.0=mkl
29 | - blaze=0.11.3=py36h4e06776_0
30 | - bokeh=0.12.13=py36h2f9c1c0_0
31 | - boto=2.48.0=py36h6e4cd66_1
32 | - bottleneck=1.2.1=py36haac1ea0_0
33 | - bzip2=1.0.6=h9a117a8_4
34 | - c-ares=1.15.0=h7b6447c_1
35 | - ca-certificates=2019.10.16=0
36 | - cairo=1.14.12=h8948797_3
37 | - certifi=2019.9.11=py36_0
38 | - chardet=3.0.4=py36h0f667ec_1
39 | - click=6.7=py36h5253387_0
40 | - cloudpickle=0.5.2=py36_1
41 | - clyent=1.2.2=py36h7e57e65_1
42 | - colorama=0.3.9=py36h489cec4_0
43 | - conda=4.7.12=py36_0
44 | - conda-build=3.4.1=py36_0
45 | - conda-env=2.6.0=h36134e3_1
46 | - conda-package-handling=1.6.0=py36h7b6447c_0
47 | - conda-verify=2.0.0=py36h98955d8_0
48 | - contextlib2=0.5.5=py36h6c84a62_0
49 | - cryptography=2.4.2=py36h1ba5d50_0
50 | - cudatoolkit=10.0.130=0
51 | - cudnn=7.6.0=cuda10.0_0
52 | - cupti=10.0.130=0
53 | - curl=7.63.0=hbc83047_1000
54 | - cycler=0.10.0=py36h93f1223_0
55 | - cython=0.27.3=py36h1860423_0
56 | - cytoolz=0.9.0=py36h14c3975_0
57 | - dask=0.16.1=py36_0
58 | - dask-core=0.16.1=py36_0
59 | - datashape=0.5.4=py36h3ad6b5c_0
60 | - dbus=1.13.2=h714fa37_1
61 | - decorator=4.2.1=py36_0
62 | - distributed=1.20.2=py36_0
63 | - docutils=0.14=py36hb0f60f5_0
64 | - entrypoints=0.2.3=py36h1aec115_2
65 | - et_xmlfile=1.0.1=py36hd6bccc3_0
66 | - expat=2.2.5=he0dffb1_0
67 | - fastcache=1.0.2=py36h14c3975_2
68 | - fastparquet=0.3.2=py36hdd07704_0
69 | - filelock=2.0.13=py36h646ffb5_0
70 | - flask-cors=3.0.3=py36h2d857d3_0
71 | - fontconfig=2.13.0=h9420a91_0
72 | - freetype=2.9.1=h8a8886c_1
73 | - fribidi=1.0.5=h7b6447c_0
74 | - get_terminal_size=1.0.0=haa9412d_0
75 | - gflags=2.2.2=he6710b0_0
76 | - glib=2.56.2=hd408876_0
77 | - glob2=0.6=py36he249c77_0
78 | - glog=0.3.5=hf484d3e_1
79 | - gmp=6.1.2=h6c8ec71_1
80 | - gmpy2=2.0.8=py36hc8893dd_2
81 | - graphite2=1.3.12=h23475e2_2
82 | - graphviz=2.40.1=h21bd128_2
83 | - gst-plugins-base=1.14.0=hbbd80ab_1
84 | - gstreamer=1.14.0=hb453b48_1
85 | - h5py=2.7.1=py36h3585f63_0
86 | - harfbuzz=1.8.8=hffaf4a1_0
87 | - hdf5=1.10.1=h9caa474_1
88 | - heapdict=1.0.0=py36_2
89 | - icu=58.2=h9c2bf20_1
90 | - idna=2.6=py36h82fb2a8_1
91 | - imageio=2.2.0=py36he555465_0
92 | - imagesize=0.7.1=py36h52d8127_0
93 | - intel-openmp=2018.0.0=hc7b2577_8
94 | - ipykernel=4.8.0=py36_0
95 | - ipython=6.2.1=py36h88c514a_1
96 | - ipython_genutils=0.2.0=py36hb52b0d5_0
97 | - ipywidgets=7.1.1=py36_0
98 | - isort=4.2.15=py36had401c0_0
 99 | - ipywidgets=7.1.1=py36_0
100 | - isort=4.2.15=py36had401c0_0
101 | - itsdangerous=0.24=py36h93cc618_1
102 | - jbig=2.1=hdba287a_0
103 | - jdcal=1.3=py36h4c697fb_0
104 | - jedi=0.11.1=py36_0
105 | - jinja2=2.10=py36ha16c418_0
106 | - joblib=0.14.0=py_0
107 | - jpeg=9b=h024ee3a_2
108 | - jsonschema=2.6.0=py36h006f8b5_0
109 | - jupyter=1.0.0=py36_4
110 | - jupyter_client=5.2.2=py36_0
111 | - jupyter_console=5.2.0=py36he59e554_1
112 | - jupyter_core=4.4.0=py36h7c827e3_0
113 | - jupyterlab=0.31.5=py36_0
114 | - jupyterlab_launcher=0.10.2=py36_0
115 | - keras-applications=1.0.6=py36_0
116 | - keras-preprocessing=1.0.5=py36_0
117 | - krb5=1.16.1=h173b8e3_7
118 | - lazy-object-proxy=1.3.1=py36h10fcdad_0
119 | - lenskit=0.7.0=py36h1aa3f02_0
120 | - libboost=1.67.0=h46d08c1_4
121 | - libcurl=7.63.0=h20c2e04_1000
122 | - libedit=3.1.20181209=hc058e9b_0
123 | - libevent=2.1.8=h1ba5d50_0
124 | - libffi=3.2.1=hd88cf55_4
125 | - libgcc-ng=9.1.0=hdf63c60_0
126 | - libgfortran-ng=7.2.0=h9f7466a_2
127 | - libopenblas=0.3.3=h5a2b251_3
128 | - libpng=1.6.37=hbc83047_0
129 | - libprotobuf=3.6.1=hd408876_0
130 | - libsodium=1.0.15=hf101ebd_0
131 | - libssh2=1.8.0=h1ba5d50_4
132 | - libstdcxx-ng=8.2.0=hdf63c60_1
133 | - libtiff=4.0.10=h2733197_2
134 | - libtool=2.4.6=h544aabb_3
135 | - libuuid=1.0.3=h1bed415_2
136 | - libxcb=1.13=h1bed415_1
137 | - libxml2=2.9.8=h26e45fe_1
138 | - libxslt=1.1.32=h1312cb7_0
139 | - llvmlite=0.28.0=py36hd408876_0
140 | - locket=0.2.0=py36h787c0ad_1
141 | - lxml=4.1.1=py36hf71bdeb_1
142 | - lz4-c=1.8.1.2=h14c3975_0
143 | - lzo=2.10=h49e0be7_2
144 | - markdown=3.0.1=py36_0
145 | - markupsafe=1.0=py36hd9260cd_1
146 | - mccabe=0.6.1=py36h5ad9710_1
147 | - mistune=0.8.3=py36_0
148 | - mkl=2019.4=243
149 | - mkl-service=2.3.0=py36he904b0f_0
150 | - mkl_fft=1.0.14=py36ha843d7b_0
151 | - mkl_random=1.1.0=py36hd6b4f25_0
152 | - mpc=1.0.3=hec55b23_5
153 | - mpfr=3.1.5=h11a74b3_2
154 | - mpmath=1.0.0=py36hfeacd6b_2
155 | - msgpack-python=0.5.1=py36h6bb024c_0
156 | - multipledispatch=0.4.9=py36h41da3fb_0
157 | - navigator-updater=0.1.0=py36h14770f7_0
158 | - nbconvert=5.3.1=py36hb41ffb7_0
159 | - nbformat=4.4.0=py36h31c9010_0
160 | - nccl=1.3.5=cuda9.0_0
161 | - ncurses=6.1=he6710b0_1
162 | - networkx=2.1=py36_0
163 | - ninja=1.9.0=py36hfd86e86_0
164 | - nltk=3.2.5=py36h7532b22_0
165 | - nose=1.3.7=py36hcdf7029_2
166 | - notebook=5.4.0=py36_0
167 | - numba=0.43.1=py36h962f231_0
168 | - numexpr=2.6.4=py36hc4a3f9a_0
169 | - numpy-base=1.17.2=py36hde5b4d6_0
170 | - odo=0.5.1=py36h90ed295_0
171 | - olefile=0.46=py36_0
172 | - openpyxl=2.4.10=py36_0
173 | - openssl=1.1.1d=h7b6447c_3
174 | - packaging=16.8=py36ha668100_1
175 | - pandas=0.25.3=py36he6710b0_0
176 | - pandoc=1.19.2.1=hea2e7c5_1
177 | - pandocfilters=1.4.2=py36ha6701b7_1
178 | - pango=1.42.4=h049681c_0
179 | - parso=0.1.1=py36h35f843b_0
180 | - partd=0.3.8=py36h36fd896_0
181 | - patchelf=0.9=hf79760b_2
182 | - path.py=10.5=py36h55ceabb_0
183 | - pathlib2=2.3.0=py36h49efa8e_0
184 | - patsy=0.5.0=py36_0
185 | - pcre=8.42=h439df22_0
186 | - pep8=1.7.1=py36_0
187 | - pexpect=4.3.1=py36_0
188 | - pickleshare=0.7.4=py36h63277f8_0
189 | - pillow=6.1.0=py36h34e0f95_0
190 | - pixman=0.34.0=hceecf20_3
191 | - pkginfo=1.4.1=py36h215d178_1
192 | - pluggy=0.6.0=py36hb689045_0
193 | - ply=3.10=py36hed35086_0
194 | - prompt_toolkit=1.0.15=py36h17d85b1_0
195 | - psutil=5.4.3=py36h14c3975_0
196 | - ptyprocess=0.5.2=py36h69acd42_0
197 | - py=1.5.2=py36h29bf505_0
198 | - pyarrow=0.11.1=py36he6710b0_0
199 | - pycodestyle=2.3.1=py36hf609f19_0
200 | - pycosat=0.6.3=py36h0a5515d_0
201 | - pycparser=2.19=py36_0
202 | - pycrypto=2.6.1=py36h14c3975_7
203 | - pycurl=7.43.0.2=py36h1ba5d50_0
204 | - pyflakes=1.6.0=py36h7bd6a15_0
205 | - pygments=2.2.0=py36h0d3125c_0
206 | - pylint=1.8.2=py36_0
207 | - pyodbc=4.0.22=py36hf484d3e_0
208 | - pyopenssl=17.5.0=py36h20ba746_0
209 | - pyparsing=2.2.0=py36hee85983_1
210 | - pyqt=5.6.0=py36h0386399_5
211 | - pyparsing=2.2.0=py36hee85983_1
212 | - pyqt=5.6.0=py36h0386399_5
213 | - pysocks=1.6.7=py36hd97a5b1_1
214 | - pytables=3.4.2=py36h3b5282a_2
215 | - pytest=3.3.2=py36_0
216 | - python=3.6.9=h265db76_0
217 | - python-dateutil=2.6.1=py36h88d3b88_1
218 | - python-snappy=0.5.4=py36he6710b0_0
219 | - pytorch=1.2.0=cuda100py36h938c94c_0
220 | - pytz=2017.3=py36h63b9c63_0
221 | - pywavelets=0.5.2=py36he602eb0_0
222 | - pyyaml=3.12=py36hafb9ca4_1
223 | - pyzmq=16.0.3=py36he2533c7_0
224 | - qt=5.6.3=h8bf5577_3
225 | - qtawesome=0.4.4=py36h609ed8c_0
226 | - qtconsole=4.3.1=py36h8f73b5b_0
227 | - qtpy=1.3.1=py36h3691cc8_0
228 | - readline=7.0=h7b6447c_5
229 | - requests=2.18.4=py36he2e5f8d_1
230 | - rope=0.10.7=py36h147e2ec_0
231 | - ruamel_yaml=0.15.35=py36h14c3975_1
232 | - scikit-image=0.13.1=py36h14c3975_1
233 | - scikit-learn=0.20.2=py36hd81dba3_0
234 | - scipy=1.2.0=py36h7c811a0_0
235 | - seaborn=0.8.1=py36hfad7ec4_0
236 | - send2trash=1.4.2=py36_0
237 | - setuptools=41.2.0=py36_0
238 | - simplegeneric=0.8.1=py36_2
239 | - singledispatch=3.4.0.3=py36h7a266c3_0
240 | - sip=4.18.1=py36h51ed4ed_2
241 | - six=1.12.0=py36_0
242 | - snappy=1.1.7=hbae5bb6_3
243 | - snowballstemmer=1.2.1=py36h6febd40_0
244 | - sortedcollections=0.5.3=py36h3c761f9_0
245 | - sortedcontainers=1.5.9=py36_0
246 | - sphinx=1.6.6=py36_0
247 | - sphinxcontrib=1.0=py36h6d0f590_1
248 | - sphinxcontrib-websupport=1.0.1=py36hb5cb234_1
249 | - spyder=3.2.6=py36_0
250 | - sqlalchemy=1.2.1=py36h14c3975_0
251 | - sqlite=3.29.0=h7b6447c_0
252 | - statsmodels=0.8.0=py36h8533d0b_0
253 | - sympy=1.1.1=py36hc6d1c1c_0
254 | - tblib=1.3.2=py36h34cf8b6_0
255 | - tensorflow=1.12.0=gpu_py36he68c306_0
256 | - tensorflow-base=1.12.0=gpu_py36h8e0ae2d_0
257 | - terminado=0.8.1=py36_1
258 | - testpath=0.3.1=py36h8cadb63_0
259 | - thrift=0.11.0=py36hf484d3e_0
260 | - thrift-cpp=0.11.0=h02b749d_3
261 | - tk=8.6.8=hbc83047_0
262 | - toolz=0.9.0=py36_0
263 | - torchvision=0.4.0=cuda100py36hecfc37a_0
264 | - torchvision-cpu=0.2.1=py36_1
265 | - tornado=4.5.3=py36_0
266 | - traitlets=4.3.2=py36h674d592_0
267 | - typing=3.6.2=py36h7da032a_0
268 | - traitlets=4.3.2=py36h674d592_0
269 | - typing=3.6.2=py36h7da032a_0
270 | - unicodecsv=0.14.1=py36ha668878_0
271 | - unixodbc=2.3.4=hc36303a_1
272 | - urllib3=1.22=py36hbe7ace6_0
273 | - wcwidth=0.1.7=py36hdf4376a_0
274 | - webencodings=0.5.1=py36h800622e_1
275 | - werkzeug=0.14.1=py36_0
276 | - wheel=0.33.6=py36_0
277 | - widgetsnbextension=3.1.0=py36_0
278 | - wrapt=1.10.11=py36h28b7045_0
279 | - xlrd=1.1.0=py36h1db9f0c_1
280 | - xlsxwriter=1.0.2=py36h3de1aca_0
281 | - xlwt=1.3.0=py36h7b00a1f_0
282 | - xz=5.2.4=h14c3975_4
283 | - yaml=0.1.7=had09818_2
284 | - zeromq=4.2.2=hbedb6e5_2
285 | - zict=0.1.3=py36h3a3bf81_0
286 | - zlib=1.2.11=h7b6447c_3
287 | - zstd=1.3.7=h0b5b093_0
288 | - pip:
289 | - absl-py==0.7.0
290 | - allennlp==0.7.1
291 | - aws-xray-sdk==0.95
292 | - awscli==1.16.59
293 | - bleach==1.5.0
294 | - boto3==1.7.4
295 | - botocore==1.12.49
296 | - bz2file==0.98
297 | - cffi==1.11.2
298 | - conllu==0.11
299 | - cookies==2.2.1
300 | - cymem==2.0.2
301 | - deprecation==2.0.6
302 | - dill==0.2.8.2
303 | - docker==3.5.1
304 | - docker-pycreds==0.3.0
305 | - ecdsa==0.13
306 | - editdistance==0.5.2
307 | - fasttext==0.8.22
308 | - flaky==3.4.0
309 | - flask==0.12.4
310 | - ftfy==5.5.0
311 | - future==0.17.1
312 | - gast==0.2.2
313 | - gensim==3.4.0
314 | - gevent==1.3.6
315 | - gputil==1.4.0
316 | - greenlet==0.4.15
317 | - grpcio==1.18.0
318 | - gurobipy==8.1.1
319 | - hdbscan==0.8.18
320 | - html5lib==0.9999999
321 | - jmespath==0.9.3
322 | - jsondiff==1.1.1
323 | - jsonnet==0.10.0
324 | - jsonpickle==1.0
325 | - jsonnet==0.10.0
326 | - jsonpickle==1.0
327 | - kiwisolver==1.0.1
328 | - matplotlib==2.2.3
329 | - mock==2.0.0
330 | - moto==1.3.4
331 | - msgpack==0.5.6
332 | - msgpack-numpy==0.4.3.2
333 | - murmurhash==1.0.1
334 | - numpy==1.15.4
335 | - numpydoc==0.8.0
336 | - overrides==1.9
337 | - parsimonious==0.8.0
338 | - pbr==5.1.1
339 | - pip==19.0.1
340 | - plac==0.9.6
341 | - preshed==2.0.1
342 | - protobuf==3.3.0
343 | - py3-ortools==6.4.4495
344 | - pyaml==18.11.0
345 | - pyasn1==0.4.4
346 | - pybind11==2.2.4
347 | - pycryptodome==3.7.0
348 | - pyenchant==2.0.0
349 | - pyhocon==0.3.50
350 | - pympler==0.5
351 | - pynvml==8.0.3
352 | - python-graphviz==0.10.1
353 | - python-jose==2.0.2
354 | - regex==2018.1.10
355 | - responses==0.10.4
356 | - rsa==3.4.2
357 | - s3transfer==0.1.13
358 | - sklearn==0.0
359 | - smart-open==1.5.7
360 | - spacy==2.0.16
361 | - sqlparse==0.2.4
362 | - tensorboard==1.7.0
363 | - tensorboardx==1.2
364 | - tensorflow-gpu==1.7.0
365 | - tensorflow-hub==0.2.0
366 | - termcolor==1.1.0
367 | - thinc==6.12.0
368 | - torch==0.4.1
369 | - torchsummary==1.5.1
370 | - tqdm==4.28.1
371 | - ujson==1.35
372 | - unidecode==1.0.22
373 | - websocket-client==0.54.0
374 | - xmltodict==0.11.0
375 |
--------------------------------------------------------------------------------
/src/eval/threshold.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2019 University of Massachusetts Amherst.
3 | This file is part of "expLinkage"
4 | http://github.com/iesl/expLinkage
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | """
15 |
16 | import time
17 | import numpy as np
18 | import matplotlib
19 | matplotlib.use('Agg')
20 | import matplotlib.pyplot as plt
21 | from pathlib import Path
22 |
23 |
24 | # Recursive search
25 | # TODO: Change to just work for F1. Remove f1ToEval argument
def choose_threshold(trainer, infMethod, epoch="0", canopies=None):
	"""Find the clustering threshold that maximizes F1 for the given inference method.

	The search has three phases:
	  1. Starting from a small threshold, repeatedly move the threshold in the
	     direction that improves precision (doubling/halving, crossing zero when
	     needed) until precision reaches 1.
	  2. From the threshold with the best recall seen so far, move the threshold
	     in the opposite direction until recall reaches 1. Phases 1 and 2 together
	     bracket the threshold region where F1 peaks.
	  3. Sample evenly spaced thresholds between the smallest and largest values
	     tried, then recursively narrow the bracket around the best-F1 threshold
	     until F1 at the lower, mid and upper bounds agrees to 3 decimal places,
	     or at most 6 refinement rounds have run.

	A diagnostic plot of F1 vs threshold is saved under <resultDir>/chooseThresh/.

	Args:
		trainer: object providing evalFunc, model, config, logger and the
			train/dev canopies (project type; used duck-typed here).
		infMethod (str): inference method name; must be "connComp" or end with
			"@t", otherwise thresholding does not apply and 0. is returned.
		epoch (str): label used only in the saved plot's file name.
		canopies: canopies to evaluate on; defaults to trainer.devCanopies when
			non-empty, else trainer.trainCanopies.

	Returns:
		float: the threshold with the best F1 among all threshold values tried.
	"""
	f1ToEval = "f1"
	if infMethod != "connComp" and (not infMethod.endswith("@t")):
		trainer.logger.info("Can not choose threshold for infMethod = {}".format(infMethod))
		return 0.

	printLog = True
	if canopies is None:
		# Prefer dev canopies for threshold selection; fall back to train canopies
		canopies = trainer.devCanopies if len(trainer.devCanopies) != 0 else trainer.trainCanopies

	# Precision and recall metric keys to use when finding the best threshold.
	# Alternatively, we could use "connComp_muc_precision" etc.
	if f1ToEval == "muc_f1":
		recallToUse = "{}_muc_recall".format(infMethod)
		precisionToUse = "{}_muc_precision".format(infMethod)
	elif f1ToEval == "f1":
		recallToUse = "{}_recall".format(infMethod)
		precisionToUse = "{}_precision".format(infMethod)
	else:
		raise Exception("Invalid f1ToUse={} to choose threshold".format(f1ToEval))

	start = time.time()
	trainer.logger.info("==" * 20 + "Beginning choosing threshold for method ={}".format(infMethod))

	currThreshold = 0.128
	precision = 0
	# Maps metric name -> {threshold: score} for every threshold evaluated
	allScores = {"{}_{}".format(infMethod, metric): {} for metric in ["precision", "recall", "f1", "muc_precision", "muc_recall", "muc_f1"]}
	f1Metric = "{}_{}".format(infMethod, f1ToEval)

	# Phase 1: push the threshold until precision reaches 1
	while precision != 1:
		scores = trainer.evalFunc(config=trainer.config, model=trainer.model, canopies=canopies,
								  threshDict={infMethod: currThreshold}, inferenceMethods=[infMethod], metricsForEval=f1ToEval)

		precision = scores[precisionToUse][0]
		for metric in allScores:
			allScores[metric][currThreshold] = scores[metric][0] if metric in scores else 0

		if printLog: trainer.logger.info("Precision:{}\t Threshold:{:.3f}".format(precision, currThreshold))
		if trainer.config.outDisSim:  # Decreasing threshold gives better precision as model outputs distance
			if currThreshold < 0:
				currThreshold *= 2  # It is a negative number: doubling makes it smaller
			elif currThreshold > 0.0001:
				currThreshold /= 2  # It is a positive number: halving makes it smaller
			elif currThreshold == 0.0:
				currThreshold = -0.128  # Assign a small negative value to move away from zero
			else:  # Very small positive: step to zero, then go negative next iteration
				currThreshold = 0.
		else:  # Increasing threshold gives better precision as model outputs similarity
			if currThreshold > 0:
				currThreshold *= 2  # It is a positive number: doubling makes it larger
			elif currThreshold < -0.0001:
				currThreshold /= 2  # It is a negative number: halving makes it larger
			elif currThreshold == 0.0:
				currThreshold = 0.128  # Assign a small positive value to move away from zero
			else:  # Very small negative: step to zero, then go positive next iteration
				currThreshold = 0.

	# Phase 2: from the best-recall threshold seen so far, push the threshold
	# the other way until recall reaches 1
	bestRecall = -1
	theshForBestRecall = None
	for threshold in allScores[recallToUse]:
		if allScores[recallToUse][threshold] > bestRecall:
			bestRecall = allScores[recallToUse][threshold]
			theshForBestRecall = threshold

	if printLog: trainer.logger.info(" Best Recall:{:.3f}\t Threshold:{:.3f}".format(bestRecall, theshForBestRecall))

	if bestRecall != 1:
		currRecall = bestRecall
		currThreshold = theshForBestRecall
		while currRecall != 1:
			scores = trainer.evalFunc(config=trainer.config, model=trainer.model, canopies=canopies, threshDict={infMethod: currThreshold},
									  inferenceMethods=[infMethod], metricsForEval=f1ToEval)
			for metric in allScores:
				allScores[metric][currThreshold] = scores[metric][0] if metric in scores else 0

			currRecall = allScores[recallToUse][currThreshold]
			if printLog: trainer.logger.info("Recall:{:.3f}\t Threshold:{:.3f}".format(currRecall, currThreshold))

			if trainer.config.outDisSim:  # Increasing threshold gives better recall as model outputs distance
				if currThreshold > 0:
					currThreshold *= 2  # It is a positive number: doubling makes it larger
				elif currThreshold < -0.0001:
					currThreshold /= 2  # It is a negative number: halving makes it larger
				elif currThreshold == 0:
					currThreshold = 0.128  # Assign a small positive value to move away from zero
				else:  # Very small negative: step to zero, then go positive next iteration
					currThreshold = 0.
			else:  # Decreasing threshold gives better recall as model outputs similarity
				if currThreshold < 0:
					currThreshold *= 2  # It is a negative number: doubling makes it smaller
				elif currThreshold > 0.0001:
					currThreshold /= 2  # It is a positive number: halving makes it smaller
				elif currThreshold == 0.0:
					currThreshold = -0.128  # Assign a small negative value to move away from zero
				else:  # Very small positive: step to zero, then go negative next iteration
					currThreshold = 0.

	''' Each time, find threshold values between which the F1 score peaks. Then try threshold values between those
	bounds and repeat the same procedure until F1 at t1, t2 and (t1+t2)/2 is the same when rounded to 3 decimals,
	or this recursive search has run more than 6 times.
	'''

	bestF1 = -1
	threshForBestF1 = None
	for threshold in allScores[f1Metric]:
		if allScores[f1Metric][threshold] > bestF1:
			bestF1 = allScores[f1Metric][threshold]
			threshForBestF1 = threshold

	# Bug fix: the original compared threshForBestF1 (a threshold value) to 1,
	# which is meaningless and left bestF1 unused. The intent is to stop early
	# when F1 itself is already perfect, so compare bestF1 instead.
	if bestF1 == 1:
		return threshForBestF1

	# Try evenly spaced values between the smallest and largest thresholds tried so far
	if printLog: trainer.logger.info("AllScores:{}".format(allScores))

	t1 = sorted(allScores[f1Metric].keys())[0]
	t2 = sorted(allScores[f1Metric].keys())[-1]
	numIntermediateThresh = 50
	thresholdValsToTry = np.arange(t1, t2, (t2 - t1) / numIntermediateThresh)
	if printLog: trainer.logger.info("Trying some additional threshold values between largest and smallest tried so far:{}".format(thresholdValsToTry))
	for thresh in thresholdValsToTry:
		scores = trainer.evalFunc(config=trainer.config, model=trainer.model, canopies=canopies, threshDict={infMethod: thresh},
								  inferenceMethods=[infMethod], metricsForEval=f1ToEval)
		for metric in allScores:
			allScores[metric][thresh] = scores[metric][0] if metric in scores else 0

	# Phase 3: recursively narrow the bracket around the best-F1 threshold
	numRecSearch = 0
	while numRecSearch <= 6:
		numRecSearch += 1
		thresholdVals = sorted(list(allScores[f1Metric].keys()))
		if len(thresholdVals) == 1:
			if printLog: trainer.logger.info("Best threshold found in just 1 attempt:{}\t{}".format(thresholdVals[0], allScores[f1Metric]))
			break

		assert len(thresholdVals) >= 2

		bestThreshold = thresholdVals[0]
		for threshTried in thresholdVals:  # Choose threshold that gave best F1 on dev set
			if printLog: trainer.logger.info("{}\tThreshold:{:.3f}\tF1:{:.6f}".format(numRecSearch, threshTried, allScores[f1Metric][threshTried]))
			if allScores[f1Metric][threshTried] >= allScores[f1Metric][bestThreshold]:
				bestThreshold = threshTried

		lowerThreshold = thresholdVals[0]
		upperThreshold = thresholdVals[-1]

		prevThreshold = None
		for threshTried in thresholdVals:
			if prevThreshold == bestThreshold:
				upperThreshold = threshTried  # Threshold immediately AFTER the threshold that gives best F1

			if threshTried == bestThreshold:
				# Threshold immediately BEFORE the threshold that gives best F1
				lowerThreshold = prevThreshold if prevThreshold is not None else bestThreshold

			prevThreshold = threshTried

		# Push upperThreshold as large as possible such that it still stays immediately next to best F1
		thresholdVals = sorted(thresholdVals)
		for ctr, threshold in enumerate(thresholdVals):
			if threshold < upperThreshold: continue
			if allScores[f1Metric][upperThreshold] == allScores[f1Metric][bestThreshold]:
				if ctr < len(thresholdVals) - 1:
					upperThreshold = thresholdVals[ctr + 1]
			else:
				break

		if printLog: trainer.logger.info("Upper Threshold:{:.3f} Lower Threshold:{:.3f}".format(upperThreshold, lowerThreshold))

		# Sample fewer intermediate points in later rounds as the bracket shrinks
		numIntermediateThresh = int(20 / numRecSearch)
		thresholdValsToTry = np.arange(lowerThreshold, upperThreshold,
									   (upperThreshold - lowerThreshold) / numIntermediateThresh)
		if printLog: trainer.logger.info("Threshold Values to try:{}".format(["{:.3f}".format(x) for x in thresholdValsToTry]))
		for currThreshold in thresholdValsToTry:
			scores = trainer.evalFunc(config=trainer.config, model=trainer.model, canopies=canopies, threshDict={infMethod: currThreshold},
									  inferenceMethods=[infMethod], metricsForEval=f1ToEval)
			for metric in allScores:
				allScores[metric][currThreshold] = scores[metric][0] if metric in scores else 0

		# Always evaluate the midpoint of the bracket; convergence is judged on it
		midThreshold = (lowerThreshold + upperThreshold) / 2
		if printLog: trainer.logger.info("Mid Threshold:{:.3f}".format(midThreshold))
		if midThreshold not in allScores[f1Metric]:
			scores = trainer.evalFunc(config=trainer.config, model=trainer.model, canopies=canopies, threshDict={infMethod: midThreshold},
									  inferenceMethods=[infMethod], metricsForEval=f1ToEval)
			for metric in allScores:
				allScores[metric][midThreshold] = scores[metric][0] if metric in scores else 0

		# Converged: F1 is flat (to 3 decimals) across lower, mid and upper bounds
		if (round(allScores[f1Metric][upperThreshold], 3) == round(allScores[f1Metric][lowerThreshold], 3)) \
				and (round(allScores[f1Metric][midThreshold], 3) == round(allScores[f1Metric][lowerThreshold], 3)):
			trainer.logger.info("Stopping as F1 at upperThreshold, lowerThreshold and midThreshold is same upto 3 decimal places")
			break

	# Choose bestThreshold from all the threshold values tried so far
	thresholdVals = sorted(list(allScores[f1Metric].keys()))
	bestThreshold = thresholdVals[0]
	for threshTried in allScores[f1Metric]:  # Choose threshold that gave best F1 on dev set
		if allScores[f1Metric][threshTried] >= allScores[f1Metric][bestThreshold]:
			bestThreshold = threshTried

	end = time.time()
	threshTried = sorted(list(allScores[f1Metric].keys()))
	if printLog: trainer.logger.info("Tried {} threshold values. Threshold tried:{}".format(len(allScores[f1Metric]), ",".join(["{:.3f}\t{:.6f}\n".format(x, allScores[f1Metric][x]) for x in threshTried])))
	trainer.logger.info("Time taken for choosing threshold={:.3f} with {} = {:.4f} is {:.3f}".format(bestThreshold, f1Metric, allScores[f1Metric][bestThreshold], end - start))
	trainer.logger.info("==" * 20 + "\n")

	Path(trainer.config.resultDir + "/chooseThresh").mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present

	# Save a diagnostic plot of F1 vs threshold, marking the chosen threshold with a star
	for metric in [f1Metric]:
		plt.clf()
		X = sorted(allScores[metric].keys())
		Y = [allScores[metric][x] for x in X]
		plt.plot(X, Y, 'ro-')
		plt.plot([bestThreshold], [allScores[metric][bestThreshold]], 'b*')
		plt.xlabel("Threshold")
		plt.ylabel("{} {}".format(infMethod, metric))
		plt.grid()
		plt.title("{} vs Threshold".format(metric))
		plt.savefig(trainer.config.resultDir + "/chooseThresh/{}_{}_{}.png".format(infMethod, metric, epoch))
		plt.close()

	return bestThreshold
269 |
--------------------------------------------------------------------------------