├── src ├── eval │ ├── __init__.py │ ├── evalF1.py │ ├── evalDendPurity.py │ ├── evalMUCF1.py │ ├── finalEval.py │ └── threshold.py ├── trainer │ ├── __init__.py │ ├── BaseTrainer.py │ ├── train_vect_data.py │ ├── train_pair_feat.py │ └── scipy_perceptron.py ├── utils │ ├── __init__.py │ ├── mds.py │ ├── fixNPCorefDataFormat.py │ ├── process_aminer_stats.py │ ├── processADANA.py │ ├── createNPDataset.py │ ├── projectFaces.py │ ├── processRexa.py │ ├── Config.py │ ├── create_synth_dataset.py │ └── combineResults.py ├── hier_clust │ ├── __init__.py │ ├── random_split.py │ └── recursive_sparsest_cut.py └── models │ ├── __init__.py │ ├── templateClassifier.py │ ├── linearClassifier.py │ └── mahalabonis.py ├── bin ├── revSyncResults.sh ├── run.sh ├── syncResults.sh ├── runDiffSeed.sh ├── setup.sh └── compileResults.sh ├── NOTICE.txt ├── config ├── synth │ └── spiral.json ├── rexa │ ├── linkage_0.json │ ├── linkage_min.json │ ├── triplet.json │ ├── linkage_auto.json │ ├── linkage_max.json │ ├── allWithin_allAcross.json │ ├── bestWithin_bestAcross.json │ └── mstWithin_bestAcross.json ├── NP_Coref │ ├── triplet.json │ ├── linkage_0.json │ ├── linkage_auto.json │ ├── linkage_max.json │ ├── linkage_min.json │ ├── allWithin_allAcross.json │ ├── mstWithin_bestAcross.json │ └── bestWithin_bestAcross.json ├── authorCoref │ ├── linkage_0.json │ ├── triplet.json │ ├── linkage_auto.json │ ├── linkage_max.json │ ├── linkage_min.json │ ├── allWithin_allAcross.json │ ├── bestWithin_bestAcross.json │ └── mstWithin_bestAcross.json └── faceData_20.tsv │ ├── triplet.json │ ├── linkage_auto.json │ ├── linkage_max.json │ ├── linkage_min.json │ ├── linkage_0.json │ ├── allWithin_allAcross.json │ ├── mstWithin_minAcross.json │ └── minWithin_minAcross.json ├── .gitignore ├── resources └── line_styles.json ├── README.md ├── LICENSE └── env.yml /src/eval/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /src/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/hier_clust/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bin/revSyncResults.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | dir=$1 3 | while true 4 | do 5 | rsync ../$dir/ -avzi blake:/iesl/canvas/nishantyadav/clustering/$dir/ 6 | sleep 60 7 | done 8 | -------------------------------------------------------------------------------- /bin/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -xu 4 | 5 | allCommand= 6 | while [ "$#" -gt 0 ]; 7 | do 8 | allCommand=" $allCommand $1 " 9 | shift 10 | done 11 | 12 | $allCommand 13 | -------------------------------------------------------------------------------- /bin/syncResults.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -xu 3 | 4 | dir=$1 5 | time=$2 6 | while true 7 | do 8 | rsync -avzi blake:/iesl/canvas/nishantyadav/clustering/$dir/ ../$dir/ 9 | sleep $time 10 | done 11 | -------------------------------------------------------------------------------- /bin/runDiffSeed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -xu 4 | startSeed=$1 5 | shift 6 | endSeed=$1 7 | shift 8 | command=$1 9 | 10 | for seed in $(seq $startSeed $endSeed) 11 | do 12 | echo $seed 13 | $command 
--seed=$seed 14 | done -------------------------------------------------------------------------------- /bin/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #set -xu 4 | 5 | export ROOT_DIR=`pwd` 6 | export PYTHONPATH=$ROOT_DIR/src:$PYTHONPATH 7 | export XCLUSTER_ROOT=$ROOT_DIR/../xcluster 8 | export XCLUSTER_JARPATH=$XCLUSTER_ROOT/target/xcluster-0.1-SNAPSHOT-jar-with-dependencies.jar 9 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | expLinkage 2 | 3 | This software is Copyright (C) 2019 University of Massachusetts 4 | Amherst, College of Information and Computer Sciences, and is licensed under the 5 | terms of the Apache License, Version 2.0 (see LICENSE.txt) or (at your option) any subsequent version. 6 | 7 | The license is approved by the Open Source Initiative, and is available 8 | from their website at http://www.opensource.org. 
9 | -------------------------------------------------------------------------------- /config/synth/spiral.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "faces", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "../data/spiralSmallRotated", 13 | "dataDir" : "../data/spiralSmallRotated", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 2, 22 | "margin" : 1, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": false, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 1000, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 10000, 32 | "epochToWrite" : 10000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 1.0, 37 | "testFrac" : 0.0, 38 | "devFrac" : 0.0, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink@t"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "linear", 46 | "model" : "AvgPerceptron", 47 | "inputDim" : 2, 48 | "outDisSim" : true, 49 | "lr" : 0.1, 50 | "l2Alpha" : 0.1, 51 | "alphaLr" : 0.01, 52 | "alphaInitMu" : 0.0, 53 | "alphaInitSigma": 0.1, 54 | "idenInit" : false 55 | 56 | } -------------------------------------------------------------------------------- /config/rexa/linkage_0.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "rexa", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/rexa", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_0", 19 | "trainModel" : true, 20 | "trainAlpha" 
: false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 500, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.375, 37 | "testFrac" : 0.375, 38 | "devFrac" : 0.25, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 14, 47 | "outDisSim" : true, 48 | "lr" : 0.1, 49 | "l2Alpha" : 0.01, 50 | "alphaLr" : 0.2, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/rexa/linkage_min.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "rexa", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/rexa", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_min", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 500, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.375, 37 | "testFrac" : 0.375, 38 | "devFrac" : 0.25, 39 | "shuffleData" : true, 40 | 41 | "inferenceMethods" : 
["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 42 | "metricsForEval" : ["f1", "dendPurity"], 43 | 44 | "modelType" : "avgLinear", 45 | "inputDim" : 14, 46 | "outDisSim" : true, 47 | "lr" : 0.1, 48 | "l2Alpha" : 0.01, 49 | "alphaLr" : 0.2, 50 | "alphaInitMu" : 0.0, 51 | "alphaInitSigma": 0.1, 52 | "idenInit" : false 53 | 54 | } -------------------------------------------------------------------------------- /config/rexa/triplet.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "rexa", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/rexa", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "triplet", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 100, 28 | 29 | "numEpoch" : 500, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.375, 37 | "testFrac" : 0.375, 38 | "devFrac" : 0.25, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 14, 47 | "outDisSim" : true, 48 | "lr" : 0.1, 49 | "l2Alpha" : 0.01, 50 | "alphaLr" : 0.2, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- 
/config/rexa/linkage_auto.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "rexa", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/rexa", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_auto", 19 | "trainModel" : true, 20 | "trainAlpha" : true, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 500, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.375, 37 | "testFrac" : 0.375, 38 | "devFrac" : 0.25, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 14, 47 | "outDisSim" : true, 48 | "lr" : 0.1, 49 | "l2Alpha" : 0.01, 50 | "alphaLr" : 0.2, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/rexa/linkage_max.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "rexa", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/rexa", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_max", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 
21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 500, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.375, 37 | "testFrac" : 0.375, 38 | "devFrac" : 0.25, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 14, 47 | "outDisSim" : true, 48 | "lr" : 0.1, 49 | "l2Alpha" : 0.005, 50 | "alphaLr" : 0.2, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/NP_Coref/triplet.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "NP_Coref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/NP_Coref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "triplet", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 100, 28 | 29 | "numEpoch" : 100, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : 
["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 102, 47 | "outDisSim" : true, 48 | "lr" : 0.05, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.1, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/NP_Coref/linkage_0.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "NP_Coref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/NP_Coref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_0", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 100, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 102, 47 | "outDisSim" : true, 48 | "lr" : 0.01, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.005, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } 
-------------------------------------------------------------------------------- /config/NP_Coref/linkage_auto.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "NP_Coref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/NP_Coref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_auto", 19 | "trainModel" : true, 20 | "trainAlpha" : true, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 100, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 102, 47 | "outDisSim" : true, 48 | "lr" : 0.01, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.01, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/NP_Coref/linkage_max.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "NP_Coref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/NP_Coref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | 
"logConsole" : true, 17 | 18 | "trainObj" : "linkage_max", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 100, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 102, 47 | "outDisSim" : true, 48 | "lr" : 0.005, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.005, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/NP_Coref/linkage_min.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "NP_Coref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/NP_Coref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_min", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 100, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 
| "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 102, 47 | "outDisSim" : true, 48 | "lr" : 0.01, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.005, 51 | "alphaInitMu" : -5.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/authorCoref/linkage_0.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "authorCoref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/authorCoref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_0", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 200, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 8, 47 | "outDisSim" : true, 48 | "lr" : 0.05, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.005, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 
0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/authorCoref/triplet.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "authorCoref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/authorCoref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "triplet", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 100, 28 | 29 | "numEpoch" : 200, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 8, 47 | "outDisSim" : true, 48 | "lr" : 0.05, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.005, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/authorCoref/linkage_auto.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "authorCoref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/authorCoref", 14 | "logFile" : 
"logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_auto", 19 | "trainModel" : true, 20 | "trainAlpha" : true, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 200, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 8, 47 | "outDisSim" : true, 48 | "lr" : 0.05, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.005, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/authorCoref/linkage_max.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "authorCoref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/authorCoref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_max", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 200, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | 
"makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 8, 47 | "outDisSim" : true, 48 | "lr" : 0.05, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.005, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/authorCoref/linkage_min.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "authorCoref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/authorCoref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_min", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 200, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 8, 47 | "outDisSim" : true, 48 | "lr" : 0.05, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 
0.005, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/rexa/allWithin_allAcross.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "rexa", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/rexa", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "allWithin_allAcross", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 500, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.375, 37 | "testFrac" : 0.375, 38 | "devFrac" : 0.25, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 14, 47 | "outDisSim" : true, 48 | "lr" : 0.05, 49 | "l2Alpha" : 0.01, 50 | "alphaLr" : 0.2, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/rexa/bestWithin_bestAcross.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "rexa", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 
13 | "dataDir" : "../data/rexa", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "bestWithin_bestAcross", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 500, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.375, 37 | "testFrac" : 0.375, 38 | "devFrac" : 0.25, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 14, 47 | "outDisSim" : true, 48 | "lr" : 0.1, 49 | "l2Alpha" : 0.01, 50 | "alphaLr" : 0.2, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/rexa/mstWithin_bestAcross.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "rexa", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/rexa", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "mstWithin_bestAcross", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 500, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | 
"epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.375, 37 | "testFrac" : 0.375, 38 | "devFrac" : 0.25, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 14, 47 | "outDisSim" : true, 48 | "lr" : 0.1, 49 | "l2Alpha" : 0.01, 50 | "alphaLr" : 0.2, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/NP_Coref/allWithin_allAcross.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "NP_Coref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/NP_Coref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "allWithin_allAcross", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 100, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 102, 47 | "outDisSim" : 
true, 48 | "lr" : 0.001, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.1, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/NP_Coref/mstWithin_bestAcross.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "NP_Coref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/NP_Coref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "mstWithin_bestAcross", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 100, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 102, 47 | "outDisSim" : true, 48 | "lr" : 0.02, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.05, 51 | "alphaInitMu" : -5.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/NP_Coref/bestWithin_bestAcross.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "NP_Coref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : 
"train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/NP_Coref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "bestWithin_bestAcross", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 100, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 102, 47 | "outDisSim" : true, 48 | "lr" : 0.05, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.05, 51 | "alphaInitMu" : -5.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/authorCoref/allWithin_allAcross.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "authorCoref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/authorCoref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "allWithin_allAcross", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 
27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 200, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 8, 47 | "outDisSim" : true, 48 | "lr" : 0.005, 49 | "l2Alpha" : 0.01, 50 | "alphaLr" : 0.005, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/authorCoref/bestWithin_bestAcross.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "authorCoref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/authorCoref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "bestWithin_bestAcross", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 200, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 
| "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 8, 47 | "outDisSim" : true, 48 | "lr" : 0.05, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.005, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/authorCoref/mstWithin_bestAcross.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "authorCoref", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "", 13 | "dataDir" : "../data/authorCoref", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "mstWithin_bestAcross", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 0.0, 22 | "margin" : 2.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": true, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 200, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 1000, 32 | "epochToWrite" : 1000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.6, 37 | "testFrac" : 0.3, 38 | "devFrac" : 0.1, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "avgLinear", 46 | "inputDim" : 8, 47 | "outDisSim" : true, 48 | "lr" : 0.05, 49 | "l2Alpha" : 0.001, 50 | "alphaLr" : 0.005, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/faceData_20.tsv/triplet.json: 
-------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "faces", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "../data/faceData_20.tsv", 13 | "dataDir" : "../data/faceData_20.tsv", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "triplet", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 100.0, 22 | "margin" : 10.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": false, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 100, 28 | 29 | "numEpoch" : 1000, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 10000, 32 | "epochToWrite" : 10000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.35, 37 | "testFrac" : 0.35, 38 | "devFrac" : 0.3, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "maha", 46 | "inputDim" : 20, 47 | "outDisSim" : true, 48 | "lr" : 0.001, 49 | "l2Alpha" : 0.1, 50 | "alphaLr" : 0.01, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/faceData_20.tsv/linkage_auto.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "faces", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "../data/faceData_20.tsv", 13 | "dataDir" : "../data/faceData_20.tsv", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_auto", 19 
| "trainModel" : true, 20 | "trainAlpha" : true, 21 | "threshold" : 100.0, 22 | "margin" : 10.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": false, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 1000, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 10000, 32 | "epochToWrite" : 10000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.35, 37 | "testFrac" : 0.35, 38 | "devFrac" : 0.3, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "maha", 46 | "inputDim" : 20, 47 | "outDisSim" : true, 48 | "lr" : 0.001, 49 | "l2Alpha" : 0.01, 50 | "alphaLr" : 0.01, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/faceData_20.tsv/linkage_max.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "faces", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "../data/faceData_20.tsv", 13 | "dataDir" : "../data/faceData_20.tsv", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_max", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 100.0, 22 | "margin" : 10.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": false, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 1000, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 10000, 32 | "epochToWrite" : 10000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.35, 37 | "testFrac" : 0.35, 
38 | "devFrac" : 0.3, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "maha", 46 | "inputDim" : 20, 47 | "outDisSim" : true, 48 | "lr" : 0.001, 49 | "l2Alpha" : 0.01, 50 | "alphaLr" : 0.01, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/faceData_20.tsv/linkage_min.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "faces", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "../data/faceData_20.tsv", 13 | "dataDir" : "../data/faceData_20.tsv", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_min", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 100.0, 22 | "margin" : 10.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": false, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 1000, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 10000, 32 | "epochToWrite" : 10000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.35, 37 | "testFrac" : 0.35, 38 | "devFrac" : 0.3, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "maha", 46 | "inputDim" : 20, 47 | "outDisSim" : true, 48 | "lr" : 0.001, 49 | "l2Alpha" : 0.01, 50 | "alphaLr" : 0.01, 51 | "alphaInitMu" : 0.0, 52 | 
"alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/faceData_20.tsv/linkage_0.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "faces_linkage_0", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "../data/faceData_20.tsv", 13 | "dataDir" : "../data/faceData_20.tsv", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "linkage_0", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 100.0, 22 | "margin" : 10.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": false, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 1000, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 10000, 32 | "epochToWrite" : 10000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.35, 37 | "testFrac" : 0.35, 38 | "devFrac" : 0.3, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "maha", 46 | "inputDim" : 20, 47 | "outDisSim" : true, 48 | "lr" : 0.001, 49 | "l2Alpha" : 0.01, 50 | "alphaLr" : 0.01, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/faceData_20.tsv/allWithin_allAcross.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "faces", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | 
"clusterFile" : "../data/faceData_20.tsv", 13 | "dataDir" : "../data/faceData_20.tsv", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "allWithin_allAcross", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 100.0, 22 | "margin" : 10.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": false, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 1000, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 10000, 32 | "epochToWrite" : 10000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.35, 37 | "testFrac" : 0.35, 38 | "devFrac" : 0.3, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "maha", 46 | "inputDim" : 20, 47 | "outDisSim" : true, 48 | "lr" : 0.001, 49 | "l2Alpha" : 0.1, 50 | "alphaLr" : 0.01, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/faceData_20.tsv/mstWithin_minAcross.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "faces", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "../data/faceData_20.tsv", 13 | "dataDir" : "../data/faceData_20.tsv", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "mstWithin_minAcross", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 100.0, 22 | "margin" : 10.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": false, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | 
"numErrorTriplet": 0, 28 | 29 | "numEpoch" : 1000, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 10000, 32 | "epochToWrite" : 10000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.35, 37 | "testFrac" : 0.35, 38 | "devFrac" : 0.3, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "maha", 46 | "inputDim" : 20, 47 | "outDisSim" : true, 48 | "lr" : 0.001, 49 | "l2Alpha" : 0.01, 50 | "alphaLr" : 0.01, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /config/faceData_20.tsv/minWithin_minAcross.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "config_name": "faces", 4 | 5 | "cuda" : true, 6 | "seed" : 1234, 7 | 8 | "mode" : "train", 9 | "resultDir" : "auto", 10 | "newDirSuffix" : "", 11 | 12 | "clusterFile" : "../data/faceData_20.tsv", 13 | "dataDir" : "../data/faceData_20.tsv", 14 | "logFile" : "logFile.txt", 15 | "bestModel" : "", 16 | "logConsole" : true, 17 | 18 | "trainObj" : "minWithin_minAcross", 19 | "trainModel" : true, 20 | "trainAlpha" : false, 21 | "threshold" : 100.0, 22 | "margin" : 10.0, 23 | "normalizeLoss" : false, 24 | "normExpLinkLoss": false, 25 | "trainExpLink" : true, 26 | "scaleDist" : false, 27 | "numErrorTriplet": 0, 28 | 29 | "numEpoch" : 1000, 30 | "numEpToAvg" : 10, 31 | "epochToEval" : 10000, 32 | "epochToWrite" : 10000, 33 | "epsilon" : 0.0001, 34 | "makeScorePlots": false, 35 | 36 | "trainFrac" : 0.35, 37 | "testFrac" : 0.35, 38 | "devFrac" : 0.3, 39 | "shuffleData" : true, 40 | 41 | 42 | "inferenceMethods" : ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", 
"compLink@t","linkage_auto","linkage_auto@t","random","recSparsest"], 43 | "metricsForEval" : ["f1", "dendPurity"], 44 | 45 | "modelType" : "maha", 46 | "inputDim" : 20, 47 | "outDisSim" : true, 48 | "lr" : 0.001, 49 | "l2Alpha" : 0.01, 50 | "alphaLr" : 0.01, 51 | "alphaInitMu" : 0.0, 52 | "alphaInitSigma": 0.1, 53 | "idenInit" : false 54 | 55 | } -------------------------------------------------------------------------------- /bin/compileResults.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #sh bin/compileResults.sh rexa 11 30 "" bestWithin_bestAcross mstWithin_bestAcross linkage_min allWithin_allAcross triplet linkage_0 linkage_max linkage_auto 3 | 4 | set -xu 5 | 6 | ############################################### FOR COMPARING DIFFERENT OBJECTIVES ON VARYYING SEEDS ######################################################## 7 | 8 | res_root=../results_refactor 9 | 10 | data=$1 11 | shift 12 | 13 | seedStart=$1 14 | shift 15 | 16 | seedEnd=$1 17 | shift 18 | 19 | suffix=$1 20 | shift 21 | 22 | seeds=$(seq $seedStart $seedEnd) 23 | 24 | allObj= 25 | while [ "$#" -gt 0 ]; 26 | do 27 | obj=$1 28 | shift 29 | allObj=" $allObj $obj " 30 | 31 | python -m utils.combineResults --outDirPrefix=BestDevThresh --baseResDir=$res_root/d\=$data --relResultDir=BestDevThresh --xlabel=Threshold --config=config/$data/$obj.json --seed $seeds --suffix=$suffix 32 | # python -m utils.combineResults --outDirPrefix=BestTestThresh --baseResDir=$res_root/d\=$data --relResultDir=BestTestThresh --xlabel=Threshold --config=config/$data/$obj.json --seed $seeds --suffix=$suffix 33 | 34 | done 35 | 36 | python -m utils.compareMethods --baseResDir=$res_root/d\=$data --outDirPrefix=BestDevThresh --trainObj $allObj --xlabel=Threshold --seed $seeds --suffix=$suffix 37 | #python -m utils.compareMethods --baseResDir=$res_root/d\=$data --outDirPrefix=BestTestThresh --trainObj $allObj --xlabel=Threshold --seed $seeds --suffix=$suffix 38 | 
39 | ##################################################################################################################################################################### 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | */.DS_Store 2 | /.DS_Store 3 | .idea/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 
"""
Copyright (C) 2019 University of Massachusetts Amherst.
This file is part of "expLinkage"
http://github.com/iesl/expLinkage
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from sklearn.datasets import load_digits
from sklearn.manifold import MDS
import random, itertools

from utils.plotting import plot_clusters


def runMDSDummy():
	"""Smoke-test: embed the first 100 digit images in 2-D with MDS.

	Points are assigned random cluster labels in {0, 1, 2} and plotted with
	an edge between every pair of points sharing a label.
	"""
	X, _ = load_digits(return_X_y=True)
	embedding = MDS(n_components=2)
	X_transformed = embedding.fit_transform(X[:100])

	pointToCluster = {}
	for x in X_transformed:
		pointToCluster[tuple(x)] = random.randint(0, 2)

	# Connect every pair of points that landed in the same (random) cluster.
	edges = []
	for p1, p2 in itertools.combinations(pointToCluster, 2):
		if pointToCluster[p1] == pointToCluster[p2]:
			edges += [(p1[0], p1[1], p2[0], p2[1])]

	plot_clusters(pointToCluster=pointToCluster, filename='../results/testMDS.png', edgeList=edges)


def runMDS(simMatrix, pidToCluster, filename):
	"""Embed a precomputed dissimilarity matrix in 2-D with MDS and plot it.

	Two plots are written: one with intra-cluster edges drawn, one without.

	Args:
		simMatrix: square precomputed dissimilarity matrix, one row per point
			(passed to MDS with dissimilarity='precomputed').
		pidToCluster: dict mapping point id (row index) -> cluster label.
		filename: suffix used to name the two output plot files.
	"""
	embedding = MDS(n_components=2, dissimilarity='precomputed')
	X_transformed = embedding.fit_transform(simMatrix)

	pointToCluster = {}
	for pid, x in enumerate(X_transformed):
		pointToCluster[tuple(x)] = pidToCluster[pid]

	# Connect every pair of embedded points from the same cluster.
	# (Removed unused local "numPoints = len(pidToCluster)".)
	edges = []
	for p1, p2 in itertools.combinations(pointToCluster, 2):
		if pointToCluster[p1] == pointToCluster[p2]:
			edges += [(p1[0], p1[1], p2[0], p2[1])]

	plot_clusters(pointToCluster=pointToCluster, filename='../results/testMDS_{}.format'.replace('.format', '.png').format(filename) if False else '../results/testMDS_{}.png'.format(filename), edgeList=edges)
	plot_clusters(pointToCluster=pointToCluster, filename='../results/testMDSWithout_{}.png'.format(filename))


if __name__ == "__main__":
	# runMDS() requires a similarity matrix; nothing to run standalone.
	pass
"label": "Train", 73 | "color": "maroon", 74 | "mec": "maroon", 75 | "marker": "o", 76 | "ls": "-", 77 | "lw": 2, 78 | "mew": 2, 79 | "ms": 6, 80 | "elinewidth": 0.1 81 | }, 82 | "fill style": { 83 | "color": "red", 84 | "alpha": 0.3 85 | } 86 | }, 87 | "train_euclid": { 88 | "style": { 89 | "label": "Train(Euclidean)", 90 | "color": "darkgreen", 91 | "mec": "darkgreen", 92 | "marker": "o", 93 | "ls": "-", 94 | "lw": 2, 95 | "mew": 2, 96 | "ms": 6, 97 | "elinewidth": 0.1 98 | }, 99 | "fill style": { 100 | "color": "darkgreen", 101 | "alpha": 0.3 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/utils/fixNPCorefDataFormat.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2019 University of Massachusetts Amherst. 3 | This file is part of "expLinkage" 4 | http://github.com/iesl/expLinkage 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | """ 15 | 16 | from pathlib import Path 17 | 18 | def fixDataFormat(origDir ,newDir): 19 | 20 | Path(newDir).mkdir(parents=True, exist_ok=True) 21 | 22 | canopyList = sorted([str(f).split("/")[-1] for f in Path(origDir).glob("*") if f.is_dir()]) 23 | 24 | print("CanopyList:",canopyList) 25 | 26 | for canopy in canopyList: 27 | Path("{}/{}".format(newDir,canopy)).mkdir(parents=True, exist_ok=True) 28 | 29 | with open("{}/{}/pairFeatures.csv".format(origDir,canopy),'r') as origFile: 30 | with open("{}/{}/pairFeatures.csv".format(newDir,canopy),'w') as newFile: 31 | for line in origFile: 32 | lineV = line.strip().split(",") 33 | if lineV[-1] == "+": 34 | lineV[-1] = "1" 35 | elif lineV[-1] == "-": 36 | lineV[-1] = "0" 37 | else: 38 | raise Exception("Invalid last token ..",lineV) 39 | 40 | processedLine = ",".join([str(v) for v in lineV[1:]]) # Exclude doc number 41 | 42 | newFile.write(processedLine +"\n") 43 | 44 | with open("{}/{}/gtClusters.tsv".format(origDir,canopy),'r') as origFile: 45 | with open("{}/{}/gtClusters.tsv".format(newDir,canopy),'w') as newFile: 46 | for line in origFile: 47 | newFile.write(line) 48 | 49 | if __name__ == '__main__': 50 | origDir = "../data/NP_Coref_withDocNum" 51 | newDir = "../data/NP_Coref" 52 | 53 | # "This was to remove docNum present in front of each line in pairFeatures.tsv 54 | fixDataFormat(origDir=origDir, newDir=newDir) -------------------------------------------------------------------------------- /src/hier_clust/random_split.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2019 University of Massachusetts Amherst. 3 | This file is part of "expLinkage" 4 | http://github.com/iesl/expLinkage 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
import numpy as np
import time
from hier_clust.expLink import getPidToPredClusters, computeDendPurity


def run_random_split(pidToCluster, k=None):
    """Build a random binary merge tree over the points in ``pidToCluster``.

    Repeatedly merges two uniformly-chosen active clusters until only one
    remains.  If ``k`` is given, flat clusters are read off at the moment
    exactly ``k`` clusters are active.

    :param pidToCluster: dict mapping point id -> gold cluster id
    :param k: optional number of flat clusters to extract
    :return: (y_pred, dendPurity) where y_pred lists a predicted cluster id
             per point and dendPurity is 0 when pidToCluster is None
    """
    numPoints = len(pidToCluster)
    frontier = list(range(numPoints))  # currently active cluster ids
    nextId = numPoints                 # id to assign to the next merged cluster

    parentOf = {}
    children = {pid: None for pid in frontier}

    y_pred = None
    while len(frontier) > 1:

        # Pick two distinct active clusters uniformly at random to merge.
        picked = np.random.choice(frontier, 2, replace=False)
        left = picked[0]
        right = picked[1]

        # Retire the merged clusters and activate their new parent.
        frontier.remove(left)
        frontier.remove(right)
        frontier.append(nextId)

        children[nextId] = (left, right)
        parentOf[left] = nextId
        parentOf[right] = nextId

        # Read off flat clusters once exactly k clusters remain.
        if k is not None and len(frontier) == k:
            flatAssignment = getPidToPredClusters(numPoints=numPoints, pidToParent=parentOf)
            y_pred = [flatAssignment[pid] for pid in range(numPoints)]

        nextId += 1

    if y_pred is None:
        # The loop finished without cutting flat clusters, i.e. every point
        # ends up in a single cluster.
        y_pred = [1 for _ in range(numPoints)]

    if pidToCluster is None:
        dendPurity = 0
    else:
        dendPurity = computeDendPurity(pidToCluster=pidToCluster, children=children, pidToParent=parentOf)

    return y_pred, dendPurity
from utils.Config import Config
from models.linearClassifier import LinearClassifier, AvgLinearClassifier
from models.templateClassifier import Classifier
from models.mahalabonis import GenLinkMahalanobis


def create_new_model(config):
    """Create a new model object based on the modelType field in the config.

    :param config: Config instance whose ``modelType`` selects the class
    :return: newly created model object
    :raises Exception: if ``config.modelType`` is not a known model type
    """
    assert isinstance(config, Config)

    # Dispatch table: modelType string -> model constructor.
    modelClasses = {
        "linear": LinearClassifier,        # pairwise classifier
        "avgLinear": AvgLinearClassifier,  # pairwise classifier using avgWeights at the end of training
        "maha": GenLinkMahalanobis,
        "template": Classifier,            # template to use skLearn classifiers with current code setup
    }

    if config.modelType not in modelClasses:
        raise Exception("Unknown Model: {}".format(config.modelType))

    return modelClasses[config.modelType](config)
import torch
import numpy as np


class Classifier(torch.nn.Module):
    """Template wrapper for plugging sklearn-style classifiers into the
    pipeline.

    ``self.clf`` (set externally) is expected to expose ``predict``; most
    other hooks are intentionally unimplemented in this template.
    """

    def __init__(self, config):
        super(Classifier, self).__init__()

        self.config = config
        self.clf = None  # sklearn-style classifier, assigned by the caller
        # Placeholder module so the trainer has parameters to reference.
        self.seqModel = torch.nn.Sequential(
            torch.nn.Linear(self.config.inputDim, self.config.inputDim)
        )

    def __str__(self):
        header = "-----------------Classifier Parameters-----------------------------"
        footer = "-------------------------------------------------------------------"
        return "{}\n{}{}".format(header, self.clf, footer)

    def getWeightStr(self):
        # The wrapped sklearn classifier has no torch parameters to print.
        return "\n\nNo parameters\n\n"

    def pairForward(self, pairFeature):
        raise NotImplementedError

    def pairBatchForward(self, pairFeatureList):
        """Score a batch of pair-feature vectors with the wrapped classifier.

        Returns a (n, 1) Variable with requires_grad=False, since sklearn
        predictions carry no gradient.
        """
        scores = self.clf.predict(pairFeatureList)
        scores = torch.FloatTensor(scores).view(-1, 1)
        return torch.autograd.Variable(scores, requires_grad=False)

    def forward(self, point1, point2):
        raise NotImplementedError

    def forwardPlain(self, point1, point2):
        # Would return the plain (non-Variable) distance between two points.
        raise NotImplementedError

    def batchForwardWithin(self, points):
        # Would return an n x n adjacency matrix for a list of points.
        raise NotImplementedError

    def batchForwardAcross(self, pointList1, pointList2):
        # Would return an n1 x n2 adjacency matrix across two point lists.
        raise NotImplementedError

    def batchForwardOneToOne(self, pointList1, pointList2):
        raise NotImplementedError


if __name__ == '__main__':
    torch.manual_seed(2)
    np.random.seed(1)
    print("There is no code to run here...")
import math
import numpy as np
from utils.basic_utils import read_canopy_data
import json
from collections import defaultdict
import itertools,csv

def run(dataDir):
    """Compute per-canopy statistics for the author-coref (AMiner) data.

    Dumps each statistic to resources/aminer/aminer_<stat>.json and writes a
    CSV of pairwise Pearson correlations between all statistics, including
    the precomputed "origin" statistic loaded from aminer_origin.json.

    :param dataDir: directory of canopy data readable by read_canopy_data
    """
    canopyData = read_canopy_data(dataDir)

    all_measures= defaultdict(dict)  # measure name -> {canopyId -> value}
    for canopyId in canopyData:
        canopy = canopyData[canopyId]
        numEnts = len(canopy["clusterToPids"])  # gold entities (clusters)
        numMents = len(canopy["pidToCluster"])  # mentions (points)
        # Mean/stddev of gold cluster sizes in this canopy
        avgMents = np.mean( [len(canopy["clusterToPids"][c]) for c in canopy["clusterToPids"]] )
        stdMents = np.std( [len(canopy["clusterToPids"][c]) for c in canopy["clusterToPids"]] )
        numSingletons = sum([1 for c in canopy["clusterToPids"] if len(canopy["clusterToPids"][c]) == 1 ])
        all_measures["numEnts"][canopyId] = numEnts
        all_measures["numMents"][canopyId] = numMents
        all_measures["avgMents"][canopyId] = avgMents
        all_measures["stdMents"][canopyId] = stdMents
        all_measures["numSingletons"][canopyId] = numSingletons

    for measure in all_measures:
        json.dump(all_measures[measure], open("resources/aminer/aminer_{}.json".format(measure),"w"))

    # "origin" is an externally precomputed measure; assumes its canopy ids
    # match those produced above — TODO confirm
    all_measures["origin"] = json.load(open("resources/aminer/aminer_origin.json","r"))

    corrCoeff = {}

    # Pearson correlation for every unordered pair of measures (incl. self).
    for m1,m2 in itertools.combinations_with_replacement(all_measures,2):
        canopies = list(all_measures[m1].keys())
        X_1 = [all_measures[m1][c] for c in canopies]
        X_2 = [all_measures[m2][c] for c in canopies]
        corrCoeff[m1,m2] = np.corrcoef(X_1, X_2)[0, 1]
        corrCoeff[m2,m1] = np.corrcoef(X_1, X_2)[1, 0]

    mlist = list(all_measures.keys())
    with open("resources/aminer/aminer_correlation.csv","w") as f:
        f = csv.DictWriter( f,["Method"]+ mlist )
        f.writeheader()
        for m1 in mlist:
            row = {"Method":m1}
            for m2 in mlist:
                row[m2] = "{:.3f}".format(corrCoeff[m1,m2])

            f.writerow(row)


if __name__ == "__main__":
    run("../data/authorCoref")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Supervised Hierarchical Clustering with Exponential Linkage
This repository contains code used in experiments for our ICML 2019 paper titled "[Supervised Hierarchical Clustering with Exponential Linkage](http://proceedings.mlr.press/v97/yadav19a.html)".

## Setup ##

Clone* and setup the **xcluster** repository from <https://github.com/iesl/xcluster>.
Make sure the **xcluster** repo is cloned in the same folder as this repo, i.e. you should have the **xcluster** and **expLinkage** folders in the same parent folder.

Set environment variables:

```
cd expLinkage
source bin/setup.sh
```

## Data Setup ##


#### Data in *n*-dim vector space ####

The `clusterFile` parameter in config files should point to the tsv file which contains data with each line in the following format:

`<pointId>	<clusterId>	<feature_1>	<feature_2>	....	<feature_d>`

#### Data with features defined on every pair of points ####

The `dataDir` parameter in config files should point to the data folder, which should be present in the following format:
```bash
├── NP_Coref
|   ├── doc1
|       ├── gtClusters.tsv
|       ├── pairFeatures.tsv
|   ├── doc2
|   ├── ...
|   ├── docn

```

All data should be in a single folder with a separate sub-folder for each canopy or set of points. Each sub-folder contains files: `gtClusters.tsv` and `pairFeatures.tsv`.

`gtClusters.tsv` contains information about ground-truth clusters for each point in the following format:
`<pointId>	<clusterId>`

`pairFeatures.tsv` contains a feature vector for each pair of points in the following format:
`<pointId_1>	<pointId_2>	<feature_1>	...	<feature_d>	<label>`

The set of points in each subfolder will be clustered separately.
## Run Code ##

#### For data in *n*-dim vector space ####

```bash
cd expLinkage
python src/trainer/train_vect_data.py --config=<path to config file> --seed=<random seed>
```

#### For data with features on every pair of points ####

```bash
cd expLinkage
python src/trainer/train_pair_feat.py --config=<path to config file> --seed=<random seed>
```

Config files for all experiments in the paper are present in the [config](config) folder.


## Notes ##
- *Code from the **xcluster** repository is only used for evaluating dendrogram purity and is not crucial for training as such (if evaluation does not involve computing dendrogram purity or no evaluation on the dev set is performed during training).
- Compatible cuda and pytorch versions:
	- cudnn : version 7.6.0, (build: cuda10.0_0)
	- pytorch : version 1.2.0 (build cuda100py36h938c94c_0)
--------------------------------------------------------------------------------
/src/utils/processADANA.py:
--------------------------------------------------------------------------------
"""
Copyright (C) 2019 University of Massachusetts Amherst.
This file is part of "expLinkage"
http://github.com/iesl/expLinkage
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
import csv, itertools
from pathlib import Path

# Read feature vectors from dataDir and write them to outDir after processing, one canopy at a time
def processADANA(dataDir, outDir):
    """Convert the ADANA author-disambiguation release into the canopy format.

    For every ``<author>.xml`` found in ``dataDir``, reads:
      * ``<author>_ans.txt`` — gold clustering, one "<paperId> <clusterId>" per line
      * ``<author>.txt``     — pairwise feature matrices (see inline comments)
    and writes ``gtClusters.tsv`` and ``pairFeatures.csv`` under
    ``outDir/<author>/``.

    :param dataDir: directory with the original per-author files
    :param outDir: output directory; one sub-directory per author is created
    """
    authorList = [str(f) for f in Path(dataDir).glob("*.xml") if f.is_file()]
    authorList = [authorFile[:-4] for authorFile in authorList]  # drop the ".xml" suffix
    print("Author list:{}".format(authorList))
    for authorFile in authorList:

        authorName = authorFile.split("/")[-1]
        # This author has no ground-truth answer file, so it cannot be processed.
        if authorFile.endswith("Wei Wang"):
            print("Skipping {} because it does not have {}_ans.txt".format(authorFile,authorFile))
            continue

        pairFeatures = {}
        pidToCluster = {}

        # Gold clustering: each line is "<paperId> <clusterId>".
        with open("{}_ans.txt".format(authorFile),"r") as f:
            for line in f:
                line =line.split()
                paperId, clusterId = int(line[0]),int(line[1])
                pidToCluster[paperId] = clusterId

        # Initialize pairFeatures to empty list
        pidList = sorted(pidToCluster)
        for p1,p2 in itertools.combinations(pidList,2):
            pairFeatures[(p1,p2)] = []

        # Feature file layout: first line is the paper count, followed by 8
        # feature matrices.  Each matrix is stored as an upper triangle: row i
        # lists values for pairs (i, i+1) .. (i, n-1), and a blank line
        # separates consecutive matrices.
        with open("{}.txt".format(authorFile),"r") as f:
            numPapers = int(f.readline().strip())
            for featNum in range(8):
                for i in range(numPapers-1):
                    line = f.readline()
                    line = [float(x) for x in line.strip().split()]
                    for j,val in enumerate(line):
                        pairFeatures[(i,i+j+1)].append(val)

                line = f.readline() # Read empty line between two feature matrices

        print("Writing down data for author:{}".format(authorFile))
        Path("{}/{}".format(outDir, authorName)).mkdir(parents=True, exist_ok=True)
        with open("{}/{}/gtClusters.tsv".format(outDir, authorName), "w") as f:
            for pid in pidToCluster:
                f.write("{}\t{}\n".format(pid, pidToCluster[pid]))

        # pairFeatures.csv rows: p1, p2, features..., label (1 iff same gold cluster).
        with open("{}/{}/pairFeatures.csv".format(outDir, authorName), "w") as f:
            writer = csv.writer(f)
            for p1, p2 in pairFeatures:
                line = [p1, p2] + pairFeatures[(p1, p2)]
                if pidToCluster[p1] == pidToCluster[p2]:
                    line.append(1)
                else:
                    line.append(0)

                writer.writerow(line)


if __name__ == "__main__":

    dataDir = "../data/rich-author-disambiguation-data/experimental-results"
    outDir = "../data/authorCoref"
    processADANA(dataDir=dataDir, outDir=outDir)
import os
from pathlib import Path
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components
from utils.fixNPCorefDataFormat import fixDataFormat

def createDataset(dataDir,outDir):
    """Build the NP-coref dataset by processing both the train and test splits.

    :param dataDir: reconcile corpus root; must contain "train"/"test" file lists
    :param outDir: output directory, one sub-directory per document
    """
    for dataType in ["train","test"]:
        processPairwiseData(dataDir, dataType,outDir)

# dataType= "train" or "test"
def processPairwiseData(dataDir,dataType,outDir):
    """Copy pairwise feature rows for each listed document and recover gold clusters.

    Reads the document list from ``<dataDir>/<dataType>``, copies each
    document's ARFF feature rows to ``<outDir>/<doc>/pairFeatures.csv``, and
    derives ``gtClusters.tsv`` by running connected components over the
    "+"-labelled pairs.

    :raises Exception: if a feature row's label is neither "+" nor "-"
    """
    with open(dataDir+"/"+dataType,"r") as f:
        fileList = f.read().split()

    Path(outDir).mkdir(parents=True, exist_ok=True)


    for file in fileList:
        Path("{}/{}".format(outDir, file)).mkdir(parents=True, exist_ok=True)
        featureFile = open("{}/{}/pairFeatures.csv".format(outDir, file), "w")
        rows, cols, data = [], [], []
        uniquePts = {}
        with open("{}/{}/features.development/features.arff".format(dataDir, file), "r") as f:
            for line in f:
                # Skip ARFF header lines and blank/too-short lines.
                if line.startswith("@"): continue
                if len(line.split(",")) < 2: continue

                featureFile.write(line)
                lineV = line.strip().split(",")
                docNum, id1, id2 = int(lineV[0]), int(lineV[1]), int(lineV[2])

                uniquePts[id1] = 1
                uniquePts[id2] = 1
                if lineV[-1] == "+":
                    # Accumulate data to create sparse matrix and then run connected components to retrieve gt clusters
                    rows += [id1]
                    cols += [id2]
                    data += [1]

                    rows += [id2]
                    cols += [id1]
                    data += [1]
                elif lineV[-1] == "-":
                    pass
                else:
                    print(lineV)
                    raise Exception("Invalid end token")
        featureFile.close()

        # Gold clusters = connected components of the positive-pair graph.
        numPoints = len(uniquePts)
        sparseMatrix = csr_matrix((data, (rows, cols)), shape=(numPoints, numPoints))
        connComp = connected_components(sparseMatrix)
        if file == "2":
            # Debug output for one specific document.
            print(file, numPoints, connComp)

        with open("{}/{}/gtClusters.tsv".format(outDir,file),"w") as f:
            for id in range(numPoints):
                f.write("{}\t{}\n".format(id, connComp[1][id]))


if __name__ == '__main__':
    tempOutDir = "../data/NP_Coref_temp"
    createDataset(dataDir="../data/reconcile/uw-corpus",outDir=tempOutDir)


    newDir = "../data/NP_Coref"

    # Remove docNum from this temporary dataset
    fixDataFormat(origDir=tempOutDir, newDir=newDir)

    os.system("rm -r {}".format(tempOutDir))
import sys
import numpy as np
from sklearn.decomposition import PCA

from utils.basic_utils import read_clusters


def projectFaces(filename, dim):
    """PCA-project the face vectors in *filename* down to *dim* dimensions.

    Writes the projected points to ``../data/faceData_<dim>.tsv`` with rows
    "<pointId>\\t<clusterId>\\t<feature...>" (features rounded to 2 decimals).

    :param filename: tsv file readable by read_clusters
    :param dim: number of PCA components to keep
    """
    clusterData = read_clusters(filename)

    # Flatten all clusters into one point list, remembering each cluster's slice.
    pointList = []
    indices = {}
    for cid in clusterData:
        start = len(pointList)
        pointList += clusterData[cid]
        end = len(pointList)
        indices[cid] = (start, end)

    pointList = [list(point) for point in pointList]
    pointList = np.array(pointList)
    # print(pointList.shape)

    pca = PCA(n_components=dim, random_state=0)
    X_prime = pca.fit_transform(pointList)
    print("Explained variance ratio for {} components\t{}\n{}".format(dim,pca.explained_variance_ratio_,sum(pca.explained_variance_ratio_)))

    # print(X_prime.shape)
    # Re-split the projected points by cluster using the recorded slices.
    newClusterData = {}
    for cid in clusterData:
        start, end = indices[cid]
        newClusterData[cid] = X_prime[start:end]
        # print(newClusterData[cid].shape)

    with open("../data/faceData_{}.tsv".format(dim), "w") as writer:
        pointId = 0
        for cid in newClusterData:
            for point in newClusterData[cid]:
                row = "{}\t{}\t".format(pointId, cid)
                row += "\t".join("{:.2f}".format(x) for x in point)
                # print(row)
                writer.write(row + "\n")
                pointId += 1


def normalizeFaces(filename):
    """Write a scaled copy of the face data to ``<filename>_norm_10.tsv``.

    NOTE(review): the data-derived maxVal is computed and then overwritten
    with the constant 100, and the output suffix is hard-coded "_norm_10";
    both look experiment-specific — confirm before reuse.
    """
    clusterData = read_clusters(filename)

    pointList = []
    indices = {}
    for cid in clusterData:
        start = len(pointList)
        pointList += clusterData[cid]
        end = len(pointList)
        indices[cid] = (start, end)


    # Largest absolute coordinate over all points (later overwritten below).
    maxVal = 0.
    for cid in clusterData:
        for point in clusterData[cid]:
            tempMax = np.max([abs(x) for x in point])
            maxVal = max(tempMax, maxVal)

    maxVal = 100
    newFilename = filename[:-4] + "_norm_10.tsv"
    with open(newFilename, "w") as writer:
        pointId = 0
        for cid in clusterData:
            for point in clusterData[cid]:
                row = "{}\t{}\t".format(pointId, cid)
                # Z = sum(point)
                Z = np.linalg.norm(point)
                origPoint = point
                point = [x/maxVal for x in point]
                # point = [x/Z for x in point]
                row += "\t".join("{:.2f}".format(x) for x in point)
                # row += "\t".join("{:.2f}".format(x) for x in origPoint)
                print(row)
                writer.write(row + "\n")
                pointId += 1

    print(maxVal)

if __name__ == "__main__":

    dim = int(sys.argv[1])
    projectFaces("../data/faceData.tsv",dim)
import csv
from pathlib import Path
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

# Read feature vectors from dataDir and write them to outDir after processing, one canopy at a time
def processRexa(dataDir, outDir):
    """Convert Rexa canopy data into the common canopy format.

    For each canopy folder, reads ``pair_vecs.tsv`` (rows: mention1, mention2,
    label, features...), recovers gold clusters by connected components over
    the positively-labelled pairs, and writes ``gtClusters.tsv`` plus
    ``pairFeatures.csv`` under ``outDir/<canopy>/``.

    :param dataDir: directory holding one folder per canopy
    :param outDir: output directory, mirrored per canopy
    :raises Exception: if a pair's label is neither "1" nor "0"
    """
    folderList = [str(f) for f in Path(dataDir).glob("*") if f.is_dir()]

    for ctr, folder in enumerate(folderList):
        canopyId = folder.split("/")[-1]
        pairFeatures = {}
        mentToId = {}
        pidToCluster = {}
        rows,cols,data = [],[],[]
        with open("{}/pair_vecs.tsv".format(folder),"r") as f:
            reader = csv.reader(f,delimiter="\t")
            for line in reader:
                m1, m2 = line[0], line[1]
                # Features are columns 3..second-to-last.
                # NOTE(review): this also drops the final column — confirm it is not a feature.
                featureVec = line[3:-1]

                pairFeatures[(m1, m2)] = featureVec
                mentToId[m1] = 1
                mentToId[m2] = 1
                if line[2] == "1":
                    # Accumulate data to create sparse matrix and then run connected components to retrieve gt clusters
                    rows += [m1]
                    cols += [m2]
                    data += [1]

                    rows += [m2]
                    cols += [m1]
                    data += [1]
                elif line[2] == "0":
                    pass
                else:
                    print(line[2])
                    raise Exception("Invalid end token")

        mentToId = {ment:ctr for ctr,ment in enumerate(mentToId)} # Assign unique id to each point

        # Find out ground-truth cluster after running connected components
        rows = [mentToId[ment] for ment in rows]
        cols = [mentToId[ment] for ment in cols]
        numPoints = len(mentToId)
        sparseMatrix = csr_matrix((data, (rows, cols)), shape=(numPoints, numPoints))
        connComp = connected_components(sparseMatrix)

        for pid in range(numPoints):
            pidToCluster[pid] = connComp[1][pid]

        Path("{}/{}".format(outDir, canopyId)).mkdir(parents=True, exist_ok=True)
        with open("{}/{}/gtClusters.tsv".format(outDir, canopyId), "w") as f:
            for pid in pidToCluster:
                f.write("{}\t{}\n".format(pid, pidToCluster[pid]))

        # pairFeatures.csv rows: id1, id2, features..., label (1 iff same gold cluster).
        with open("{}/{}/pairFeatures.csv".format(outDir, canopyId), "w") as f:
            writer = csv.writer(f)
            for m1,m2 in pairFeatures:
                line = [ mentToId[m1], mentToId[m2] ] + pairFeatures[(m1,m2)]

                if pidToCluster[mentToId[m1]] == pidToCluster[mentToId[m2]]:
                    line.append(1)
                else:
                    line.append(0)

                writer.writerow(line)

if __name__ == "__main__":

    # dataDir = "../data/data/rexa/canopy"
    dataDir = "../data/data_rexa_all/nick-rexa/rexa/canopy"
    outDir = "../data/rexa_new"
    processRexa(dataDir=dataDir, outDir=outDir)
import torch
import os

class BaseTrainer(object):
    """Base class for trainers: holds the config, data canopies, and the
    model/optimizer loading-and-reset plumbing shared by concrete trainers."""

    def __init__(self, config):
        super(BaseTrainer, self).__init__()

        # Imported lazily so this module can be imported without the full
        # package on sys.path (and to avoid import-cycle risk) — the class is
        # otherwise self-contained.
        from utils.Config import Config
        assert isinstance(config, Config)
        self.config = config
        self.logger = None
        self.optimizer = None
        self.trainCanopies = {}
        self.testCanopies = {}
        self.devCanopies = {}

    def __str__(self):
        return "Base Trainer Class"

    def train(self):
        raise NotImplementedError

    def loadModel(self):
        """Load the best saved model and reset the optimizer to its parameters.

        Search order: config.bestModel as an absolute/relative path, then
        resultDir/bestModel, then resultDir/model_alpha.torch, and finally
        resultDir/model.torch.
        """
        # Load model and reset optimizer to have parameters of the loaded model
        if os.path.isfile(self.config.bestModel):
            self.model = torch.load(self.config.bestModel)
            self.logger.info("Loading model from:{}".format(self.config.bestModel))
        else:
            bestModel = os.path.join(self.config.resultDir, self.config.bestModel)
            if os.path.isfile(bestModel):
                self.model = torch.load(bestModel)
                self.logger.info("Loading model from:{}".format(bestModel))
            else:
                try:
                    bestModel = os.path.join(self.config.resultDir, "model_alpha.torch")
                    self.model = torch.load(bestModel)
                    self.logger.info("Loading model from:{}".format(bestModel))
                except Exception:  # was a bare except:; narrowed so Ctrl-C etc. still propagate
                    bestModel = os.path.join(self.config.resultDir, "model.torch")
                    self.model = torch.load(bestModel)
                    self.logger.info("Loading model from:{}".format(bestModel))

        self.resetOptimizer()

    def resetOptimizer(self):
        """Rebuild self.optimizer from the trainModel/trainAlpha config flags.

        linkage_auto training requires optimizing both the model weights and
        the linkage alpha; the flag combinations are asserted accordingly.
        """
        if self.config.trainObj == "linkage_auto":
            assert self.config.trainModel and self.config.trainAlpha

        if self.config.trainModel and self.config.trainAlpha:  # Add model.seqModel parameters and linkAlpha to the optimizer
            assert self.config.trainObj == "linkage_auto"
            self.optimizer = torch.optim.Adam([{'params': self.model.seqModel.parameters(), 'lr': self.config.lr, 'weight_decay': self.config.l2Alpha}])
            self.optimizer.add_param_group({'params': self.model.linkAlpha, 'lr': self.config.alphaLr})

        elif (not self.config.trainModel) and self.config.trainAlpha:  # Add linkAlpha to the optimizer
            self.optimizer = torch.optim.Adam([{'params': self.model.linkAlpha, "lr": self.config.alphaLr}])

        elif self.config.trainModel and (not self.config.trainAlpha):  # Add model.seqModel parameters to optimizer
            assert self.config.trainObj != "linkage_auto"
            self.optimizer = torch.optim.Adam([{'params': self.model.seqModel.parameters(), 'lr': self.config.lr, 'weight_decay': self.config.l2Alpha}])

        else:
            # BUG FIX: the original called torch.optim.Adam() with no
            # arguments, which raises TypeError (Adam requires a params
            # iterable). Nothing is being trained in this configuration, so
            # there is simply no optimizer to build.
            self.optimizer = None
# y_true & y_pred is list of labels for each point.
# Recall, Precision, F1 is for predicted edges on underlying points
def comp_prec_rec_f1_fast(y_true, y_pred):
    """Compute pairwise precision/recall/F1 by shelling out to xcluster.

    Writes predicted.tsv / goldFile.tsv into the current working directory,
    runs xcluster's bin/util/score_pairwise.sh (requires the XCLUSTER_ROOT
    environment variable and its bin/setup.sh), and parses the scores back
    from a timestamp-named temp result file.

    NOTE(review): the "../singleLinkage/" input paths are hard-coded relative
    to XCLUSTER_ROOT — confirm the expected checkout layout.

    :param y_true: gold cluster label per point
    :param y_pred: predicted cluster label per point
    :return: dict with keys "precision", "recall", "f1"
    """
    assert (len(y_true) == len(y_pred))
    t1 = time.time()
    with open("predicted.tsv", "w") as predicted:
        for id, val in enumerate(y_pred):
            predicted.write(str(id) + "\t" + str(val) + "\n")

    with open("goldFile.tsv", "w") as goldFile:
        for id, val in enumerate(y_true):
            goldFile.write(str(id) + "\t" + str(val) + "\n")

    filenum = time.time()  # timestamp used to make the temp result filename unique
    command = "cd $XCLUSTER_ROOT && source bin/setup.sh &&"
    command += "sh bin/util/score_pairwise.sh ../singleLinkage/predicted.tsv ../singleLinkage/goldFile.tsv algo data None > tempResult_{}".format(filenum)
    print("executing command::\n{}\n".format(command))
    os.system(command)
    precision, recall, f1 = 0, 1, 0  # defaults if the result file is empty
    XCLUSTER_ROOT = os.getenv("XCLUSTER_ROOT")

    with open("{}/tempResult_{}".format(XCLUSTER_ROOT,filenum), "r") as results:
        for line in results:
            algo, data, precision, recall, f1 = line.split()
            precision = float(precision)
            recall = float(recall)
            f1 = float(f1)

    # Clean up the temp result file.
    command = "rm {}/tempResult_{}".format(XCLUSTER_ROOT, filenum)
    print("executing command::\n{}\n".format(command))
    os.system(command)
    t2 = time.time()
    print("Time taken = {:.3f}".format(t2 - t1))
    return {"precision": precision, "recall": recall, "f1": f1}
# y_true & y_pred is list of labels for each point.
# Recall, Precision, F1 is for predicted edges on underlying points
def comp_prec_rec_f1(y_true, y_pred):
    """
    Compute pairwise precision/recall/F1 over point pairs.

    A pair of points is a predicted positive if it shares a predicted label and
    an actual positive if it shares a gold label. Instead of enumerating all
    O(n^2) pairs (the old implementation even counted true negatives it never
    used), pair counts are derived from label-frequency contingency counts in
    O(n + #labels) time: the number of same-group pairs is sum(c*(c-1)/2) over
    group sizes c.

    Args:
        y_true: list of gold cluster labels, one per point (labels must be hashable)
        y_pred: list of predicted cluster labels, one per point (same length)

    Returns:
        dict with "precision", "recall", "f1" plus the integer
        numerators/denominators "precision_num", "precision_den",
        "recall_num", "recall_den". Empty denominators yield 1. (as before).
    """
    assert (len(y_true) == len(y_pred))
    # Local import keeps this module's top-level imports untouched.
    from collections import Counter

    def num_pairs(group_sizes):
        # Number of unordered pairs within groups of the given sizes: sum C(c, 2)
        return sum(c * (c - 1) // 2 for c in group_sizes)

    # Pairs agreeing in BOTH clusterings = pairs within each (true, pred) cell.
    truePos = num_pairs(Counter(zip(y_true, y_pred)).values())
    predPos = num_pairs(Counter(y_pred).values())    # truePos + falsePos
    actualPos = num_pairs(Counter(y_true).values())  # truePos + falseNeg

    precision = truePos / predPos if predPos > 0 else 1.
    recall = truePos / actualPos if actualPos > 0 else 1.
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0. else 0.

    return {"precision": precision, "recall": recall, "f1": f1,
            "recall_num": truePos, "recall_den": actualPos,
            "precision_num": truePos, "precision_den": predPos}
def write_tree(children_, Y, X_labels, filename):
    """
    Serialize an agglomerative-clustering merge tree to a TSV edge list.

    The children of each non-leaf node. Values less than n_samples correspond
    to leaves of the tree which are the original samples. A node i greater than
    or equal to n_samples is a non-leaf node and has children
    children_[i - n_samples]. Alternatively at the i-th iteration,
    children[i][0] and children[i][1] are merged to form node n_samples + i.

    Args:
        children_: merge array in scikit-learn AgglomerativeClustering.children_ format
        Y: gold cluster label for each leaf point
        X_labels: identifier written for each leaf point
        filename: path of the TSV file to write

    Returns:
        None. Writes one "<child>\t<parent>\t<label>" row per tree edge
        (leaves carry their gold label, internal nodes "None") plus a final
        "<root>\tNone\tNone" row.
    """
    num_samples = len(Y)
    with open(filename, 'w') as fout:
        for i in range(0, len(children_)):
            # Internal node created at merge step i is named "id_<i + num_samples>"
            node_i_id = "id_" + str(i + num_samples)

            if children_[i][0] < num_samples:
                # Left child is a leaf: emit its point id and gold label
                child_0_node_id = str(X_labels[int(children_[i][0])])
                child_0_label = str(Y[int(children_[i][0])])
            else:
                child_0_node_id = "id_" + str(int(children_[i][0]))
                child_0_label = "None"

            if children_[i][1] < num_samples:
                # Right child is a leaf: emit its point id and gold label
                child_1_node_id = str(X_labels[int(children_[i][1])])
                child_1_label = str(Y[int(children_[i][1])])
            else:
                child_1_node_id = "id_" + str(int(children_[i][1]))
                child_1_label = "None"

            fout.write("{}\t{}\t{}\n".format(child_0_node_id, node_i_id, child_0_label))
            fout.write("{}\t{}\t{}\n".format(child_1_node_id, node_i_id, child_1_label))
        # The last merge produces the root, which has no parent.
        root_ = "id_" + str(len(children_) + num_samples - 1)
        fout.write("{}\tNone\tNone\n".format(root_))

def calc_dend_purity(linkTree, pidList, y_true):
    """
    Compute dendrogram purity of a tree by shelling out to the external
    xcluster toolkit's score_tree.sh (requires the XCLUSTER_ROOT env var).

    Args:
        linkTree: either an already-formatted tree TSV string, or a
            scikit-learn style children_ merge array (passed to write_tree)
        pidList: point identifiers for the leaves (used only in the
            children_-array case)
        y_true: gold cluster label for each leaf point

    Returns:
        float dendrogram purity parsed from the first line of the script output
        (0 if the output file is empty).
    """
    dendPurity = 0
    XCLUSTER_ROOT = os.getenv("XCLUSTER_ROOT")
    # Timestamp-based file name; loop below retries until it is unused.
    filenum = time.time()
    treeFileName = "{}/perchTree_{}.tree".format(XCLUSTER_ROOT, filenum)

    while os.path.isfile(treeFileName):
        filenum = time.time()
        treeFileName = "{}/perchTree_{}.tree".format(XCLUSTER_ROOT, filenum)

    if isinstance(linkTree, str):  # If linkTree is already a formatted string then just write it
        with open(treeFileName, "w") as f:
            f.write(linkTree)
    else:
        write_tree(linkTree, y_true, pidList, treeFileName)

    assert os.path.isfile(treeFileName)

    command = "cd $XCLUSTER_ROOT && source bin/setup.sh && pwd && "
    command += "sh bin/util/score_tree.sh {} algo data 24 None > treeResult_{}".format(treeFileName, filenum)
    os.system(command)

    # Result rows look like "<algo> <data> <dendPurity>"; only the first is used.
    resultFileName = "{}/treeResult_{}".format(XCLUSTER_ROOT, filenum)
    with open(resultFileName, "r") as reader:
        for line in reader:
            algo, data, dendPurity = line.split()
            dendPurity = float(dendPurity)
            break

    # Clean up both temporary files.
    command = "rm {} && rm {}".format(treeFileName, resultFileName)
    # print("Removing files:{}".format(command))
    os.system(command)
    assert not os.path.isfile(treeFileName)
    assert not os.path.isfile(resultFileName)
    return dendPurity
14 | """ 15 | 16 | import itertools 17 | from scipy.sparse import csr_matrix 18 | from scipy.sparse.csgraph import connected_components 19 | 20 | def calc_muc_score(pidToCluster_true, pidToCluster_pred): 21 | # For each predicted cluster, accumulate points in that cluster 22 | predClusterToPids = {} 23 | for idx, predCluster in enumerate(pidToCluster_pred): 24 | try: 25 | predClusterToPids[predCluster].append(idx) 26 | except: 27 | predClusterToPids[predCluster] = [idx] 28 | 29 | precNumerator, precDenominator = 0, 0 30 | for predCid in predClusterToPids: # Compute precision for each predicted cluster. Find connected component in each predicted cluster 31 | pidList = predClusterToPids[predCid] 32 | if len(pidList) <= 1: 33 | continue 34 | 35 | data, rows, cols = [], [], [] 36 | for p1, p2 in itertools.combinations(pidList, 2): 37 | if pidToCluster_true[p1] == pidToCluster_true[p2]: 38 | data += [1] 39 | rows += [p1] 40 | cols += [p2] 41 | 42 | data += [1] 43 | rows += [p2] 44 | cols += [p1] 45 | idMapping = {p: idx for idx, p in enumerate(pidList)} 46 | rows = [idMapping[p] for p in rows] 47 | cols = [idMapping[p] for p in cols] 48 | numPointInCluster = len(pidList) 49 | predClusterSparseMatrix = csr_matrix((data, (rows, cols)), shape=(numPointInCluster, numPointInCluster)) 50 | 51 | numConnComp = connected_components(predClusterSparseMatrix)[0] 52 | precNumerator += numPointInCluster - numConnComp 53 | precDenominator += numPointInCluster - 1 54 | # print("Points in predCluster:{}\t{}\n{}/{}".format(predCid, pidList, numPointInCluster - numConnComp, numPointInCluster-1)) 55 | 56 | precision = precNumerator / precDenominator if precDenominator > 0 else 1 57 | 58 | trueClusterToPids = {} 59 | for idx, trueCluster in enumerate(pidToCluster_true): 60 | try: 61 | trueClusterToPids[trueCluster].append(idx) 62 | except: 63 | trueClusterToPids[trueCluster] = [idx] 64 | 65 | recallNumerator, recallDenominator = 0, 0 66 | for trueCid in trueClusterToPids: 67 | pidList = 
trueClusterToPids[trueCid] 68 | if len(pidList) <= 1: 69 | continue 70 | 71 | data, rows, cols = [], [], [] 72 | for p1, p2 in itertools.combinations(pidList, 2): 73 | if pidToCluster_pred[p1] == pidToCluster_pred[p2]: 74 | data += [1] 75 | rows += [p1] 76 | cols += [p2] 77 | 78 | data += [1] 79 | rows += [p2] 80 | cols += [p1] 81 | 82 | idMapping = {p: idx for idx, p in enumerate(pidList)} 83 | rows = [idMapping[p] for p in rows] 84 | cols = [idMapping[p] for p in cols] 85 | numPointInCluster = len(pidList) 86 | 87 | trueClusterSparseMatrix = csr_matrix((data, (rows, cols)), shape=(numPointInCluster, numPointInCluster)) 88 | numConnComp = connected_components(trueClusterSparseMatrix)[0] 89 | recallNumerator += numPointInCluster - numConnComp 90 | recallDenominator += numPointInCluster - 1 91 | # print("Points in trueCluster:{}\t{}\n{}/{}".format(trueCid, pidList, numPointInCluster - numConnComp, numPointInCluster - 1)) 92 | 93 | recall = recallNumerator / recallDenominator if recallDenominator > 0 else 1 94 | f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0 95 | 96 | return {"muc_precision": precision, "muc_recall": recall, "muc_f1": f1, 97 | "muc_precision_num": precNumerator, "muc_precision_den": precDenominator, 98 | "muc_recall_num": recallNumerator, "muc_recall_den": recallDenominator} 99 | -------------------------------------------------------------------------------- /src/utils/Config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2019 University of Massachusetts Amherst. 3 | This file is part of "expLinkage" 4 | http://github.com/iesl/expLinkage 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
class Config(object):
    """Experiment configuration.

    Holds every tunable knob for training/evaluation with its default value.
    Passing `filename` (a JSON file) overrides any subset of the defaults;
    afterwards the GPU flag, random seeds and result directory are re-derived
    from the (possibly overridden) values.
    """

    def __init__(self, filename=None):

        self.config_name = filename

        # Hardware / reproducibility
        self.cuda = True
        self.useGPU = self.cuda and torch.cuda.is_available()
        self.seed = 1234

        self.mode = "train"
        self.resultDir = "auto"  # "auto" derives a path from dataDir/trainObj/seed
        self.newDirSuffix = ""

        self.clusterFile = ""
        self.dataDir = ""
        self.logFile = "logFile.txt"
        self.bestModel = ""
        self.logConsole = True

        # Training Specific
        self.trainObj = ""
        self.threshold = 0.
        self.margin = 2.
        self.normalizeLoss = False  # Normalize loss for training methods other than those starting with "linkage"
        self.normExpLinkLoss = True  # Normalize loss for training methods starting with "linkage"
        self.trainExpLink = False
        self.scaleDist = False  # Used with VectDataTrainer and ExpLink
        self.numErrorTriplet = 1

        self.numEpoch = 100
        self.numEpToAvg = 10
        self.epochToEval = 1000
        self.epochToWrite = 1000
        self.epsilon = 0.001
        self.makeScorePlots = True
        self.evalBeforeTrain = False
        self.evalOnTrainThresh = False
        self.evalOnTestThresh = False

        # Train/test/dev split fractions
        self.trainFrac = 0.6
        self.testFrac = 0.3
        self.devFrac = 0.1
        self.shuffleData = True

        # Eval Specific
        self.inferenceMethods = ["singleLink", "singleLink@t", "avgLink", "avgLink@t", "compLink", "compLink@t"]
        self.metricsForEval = ["f1", "randIndex", "dendPurity"]

        # Scoring Model Specific Parameters
        self.modelType = ""
        self.inputDim = 1  # Dataset specific
        self.outDisSim = True
        self.lr = 0.01
        self.l2Alpha = 0.01
        self.alphaLr = 0.01
        self.alphaInitMu = 0.
        self.alphaInitSigma = 0.01
        self.trainAlpha = True
        self.trainModel = True
        self.idenInit = False  # Useful for Mahalanobis distance learner only

        if filename is not None:
            # Bug fix: previously json.load(open(filename)) leaked the file handle.
            with open(filename) as f:
                self.__dict__.update(json.load(f))

        # REDO Following three steps after updating any important parameter in config object
        self.useGPU = self.cuda and torch.cuda.is_available()
        self.updateRandomSeeds(self.seed)
        self.updateResultDir(self.resultDir)

    def to_json(self):
        """Serialize the JSON-representable part of this config to a string."""
        return json.dumps(filter_json(self.__dict__), indent=4, sort_keys=True)

    def save_config(self, exp_dir, filename='config.json'):
        """Write this config as JSON into `exp_dir/filename`."""
        with open(os.path.join(exp_dir, filename), 'w') as fout:
            fout.write(self.to_json())
            fout.write('\n')

    def __getstate__(self):
        # Drop the (unpicklable) logger, if one was attached, before pickling.
        state = dict(self.__dict__)
        if "logger" in state:
            del state['logger']

        return state

    def updateResultDir(self, newResultDir):
        """Set resultDir; "auto<suffix>" derives a canonical path from dataDir, trainObj and seed."""

        if newResultDir.startswith("auto"):
            miscInfo = newResultDir[4:]  # anything after "auto" becomes a directory-name suffix
            dataType = self.dataDir.split("/")[-1]
            self.resultDir = "{base}/d={d}/obj={obj}_s={s}{misc}".format(
                base="../results_refactor",
                d=dataType,
                obj=self.trainObj,
                s=self.seed,
                misc=miscInfo)
        else:
            self.resultDir = newResultDir

    def updateRandomSeeds(self, random_seed):
        """Seed python/torch/numpy/cuda RNGs; sub-seeds are derived from `random_seed`."""

        self.seed = random_seed
        random.seed(random_seed)

        self.torch_seed = random.randint(0, 1000)
        self.np_seed = random.randint(0, 1000)
        self.cuda_seed = random.randint(0, 1000)

        torch.manual_seed(self.torch_seed)
        np.random.seed(self.np_seed)
        if self.useGPU and torch.cuda.is_available():
            torch.cuda.manual_seed(self.cuda_seed)


def filter_json(the_dict):
    """Recursively keep only entries whose values are JSON-serializable:
    str/float/int/bool/list/None directly, dicts filtered the same way."""
    res = {}
    for k, v in the_dict.items():
        # isinstance replaces the old `type(x) is ...` chain (idiomatic, and
        # also accepts subclasses such as numpy scalar types).
        if v is None or isinstance(v, (str, float, int, bool, list)):
            res[k] = v
        elif isinstance(v, dict):
            res[k] = filter_json(v)
    return res
14 | """ 15 | 16 | import torch 17 | from torch.autograd import Variable 18 | import numpy as np 19 | from utils.Config import Config 20 | 21 | class LinearClassifier(torch.nn.Module): 22 | """docstring for Linear Classifier""" 23 | 24 | def __init__(self, config): 25 | super(LinearClassifier, self).__init__() 26 | assert isinstance(config, Config) 27 | self.config = config 28 | self.inputDim = config.inputDim # Dimension of vector for each point 29 | self.outputDim = 1 30 | 31 | self.seqModel = torch.nn.Sequential( 32 | torch.nn.Linear(self.inputDim,self.outputDim) 33 | ) 34 | 35 | tempAlphaVal = np.random.normal(self.config.alphaInitMu, self.config.alphaInitSigma, 1)[0] 36 | if self.config.useGPU: 37 | self.linkAlpha = Variable(torch.cuda.FloatTensor([tempAlphaVal]), requires_grad=True) 38 | else: 39 | self.linkAlpha = Variable(torch.FloatTensor([tempAlphaVal]), requires_grad=True) 40 | 41 | 42 | def __str__(self): 43 | printStr = "" 44 | printStr += "-----------------Linear Classifier Parameters----------------------" + "\n" 45 | printStr += "linkAlpha:" + str(self.linkAlpha) + "\n" 46 | printStr += "inputDim::" + str(self.inputDim) + "\n" 47 | printStr += "output dissimilarity\t" + str(self.config.outDisSim) + "\n" 48 | printStr += "Layers::" + str(self.seqModel) + "\n" 49 | printStr += self.getWeightStr() 50 | 51 | printStr += "-------------------------------------------------------------------" 52 | return printStr 53 | 54 | def getWeightStr(self): 55 | weightStr = "" 56 | weightStr += "Weight::{}".format(self.seqModel[0].weight) + "\n" 57 | weightStr += "Bias::{}".format(self.seqModel[0].bias) + "\n" 58 | return weightStr 59 | 60 | def pairForward(self,pairFeature): 61 | if self.config.useGPU: 62 | pairFeature = Variable(torch.cuda.FloatTensor(pairFeature)) 63 | else: 64 | pairFeature = Variable(torch.Tensor(pairFeature)) 65 | 66 | prediction = self.seqModel(pairFeature) 67 | return prediction 68 | 69 | def pairBatchForward(self,pairFeatureList): 70 | if 
self.config.useGPU: 71 | pairFeatureList = Variable(torch.cuda.FloatTensor(pairFeatureList)) 72 | else: 73 | pairFeatureList = Variable(torch.Tensor(pairFeatureList)) 74 | 75 | prediction = self.seqModel(pairFeatureList) 76 | return prediction 77 | 78 | class AvgLinearClassifier(LinearClassifier): 79 | 80 | def __init__(self, config): 81 | super(AvgLinearClassifier, self).__init__(config) 82 | biasPresent = self.seqModel[0].bias is not None 83 | self.updateNum = 0 84 | self.avgWeights = torch.nn.Linear(self.inputDim, self.outputDim, bias=biasPresent) 85 | 86 | def __str__(self): 87 | printStr = "" 88 | printStr += "-----------------Average Linear Classifier Parameters-----------------------------" + "\n" 89 | printStr += "linkAlpha::\t" + str(self.linkAlpha) + "\n" 90 | printStr += "inputDim::\t" + str(self.inputDim) + "\n" 91 | printStr += "output dissimilarity\t" + str(self.config.outDisSim) + "\n" 92 | printStr += "updateNum" + str(self.updateNum) + "\n" 93 | printStr += "Layers::" + str(self.seqModel) + "\n" 94 | printStr += self.getWeightStr() 95 | printStr += "-------------------------------------------------------------------" 96 | return printStr 97 | 98 | def getWeightStr(self): 99 | weightStr = "" 100 | weightStr += "Weight::{}".format(self.seqModel[0].weight) + "\n" 101 | weightStr += "Bias::{}".format(self.seqModel[0].bias) + "\n" 102 | 103 | weightStr += "Avg Weight::{}".format(self.avgWeights.weight.data) + "\n" 104 | weightStr += "Avg Bias::{}".format(self.avgWeights.bias.data)+ "\n" 105 | return weightStr 106 | 107 | # Average weights after making gradient update 108 | def updateAvgWeights(self): 109 | 110 | self.avgWeights.weight.data = self.updateNum * self.avgWeights.weight.data + self.seqModel[0].weight.data 111 | if self.avgWeights.bias is not None: 112 | self.avgWeights.bias.data = self.updateNum * self.avgWeights.bias.data + self.seqModel[0].bias.data 113 | 114 | self.updateNum += 1 115 | self.avgWeights.weight.data = 
self.avgWeights.weight.data / self.updateNum 116 | if self.avgWeights.bias is not None: 117 | self.avgWeights.bias.data = self.avgWeights.bias.data / self.updateNum 118 | 119 | def pairAvgBatchForward(self, pairFeatureList): 120 | if self.config.useGPU: 121 | pairFeatureList = Variable(torch.cuda.FloatTensor(pairFeatureList)) 122 | else: 123 | pairFeatureList = Variable(torch.Tensor(pairFeatureList)) 124 | 125 | prediction = self.avgWeights(pairFeatureList) 126 | return prediction 127 | 128 | def pairAvgForward(self,pairFeature): 129 | if self.config.useGPU: 130 | pairFeature = Variable(torch.cuda.FloatTensor(pairFeature)) 131 | else: 132 | pairFeature = Variable(torch.Tensor(pairFeature)) 133 | 134 | prediction = self.avgWeights(pairFeature) 135 | return prediction 136 | 137 | -------------------------------------------------------------------------------- /src/hier_clust/recursive_sparsest_cut.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2019 University of Massachusetts Amherst. 3 | This file is part of "expLinkage" 4 | http://github.com/iesl/expLinkage 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | """ 15 | 16 | 17 | """Run hierarchical sparsest cut.""" 18 | import argparse 19 | import datetime 20 | import numpy as np 21 | import uuid 22 | import os 23 | import sys 24 | 25 | from itertools import combinations 26 | 27 | from sklearn.cluster import SpectralClustering 28 | 29 | def log_exp_minus_dist(x, y): 30 | # return -((x - y).norm(2, 1)).unsqueeze(1) 31 | return np.linalg.norm(-(x - y)) 32 | 33 | 34 | def log_1_by_1p_dist(x, y): 35 | # return - torch.log1p(np.sqrt((x - y).norm(2, 1))).unsqueeze(1) 36 | return - np.log1p(np.linalg.norm(np.sqrt(x - y))) 37 | 38 | def sparsest_cut(sims): 39 | if len(sims) == 2: 40 | return [0],[1] 41 | else: 42 | spectral = SpectralClustering(n_clusters=2,n_jobs=-1,affinity='precomputed') 43 | labels = spectral.fit_predict(sims) 44 | # print("SC gives: ") 45 | # print(labels) 46 | left = np.where(labels==0)[0].astype(np.int) 47 | # print("left") 48 | # print(left) 49 | right = np.where(labels==1)[0].astype(np.int) 50 | # print("right") 51 | # print(right) 52 | return left,right 53 | 54 | def run(sim_file,label_file,out_file): 55 | sims = np.load(sim_file) 56 | labels = np.load(label_file) 57 | 58 | # (Node id, parent id, label, mat, objs) 59 | output = '' 60 | frontier = [(uuid.uuid4(), 'None', 'None', sims, np.arange(labels.shape[0]))] 61 | num_done = 0 62 | while frontier: 63 | # print("Splits on frontier: {}. Completed {}".format(len(frontier), num_done)) 64 | nid, pid, label, mat, obs = frontier.pop(0) 65 | output += '%s\t%s\t%s\n' % (nid, pid, label) 66 | if obs.shape[0] > 1: 67 | l, r = sparsest_cut(mat) 68 | # Sometimes, this sparsest cut will not split the nodes. If this is 69 | # the case, we need to manually split them. 
70 | if np.size(l) == 0: 71 | raise Exception('bad case...') 72 | l = [0] 73 | r = list(range(1, len(obs))) 74 | if np.size(r) == 0: 75 | raise Exception('bad case...') 76 | r = [0] 77 | l = list(range(1, len(obs))) 78 | 79 | if np.size(l) > 1: 80 | l_nid = uuid.uuid4() 81 | l_label = 'None' 82 | else: 83 | assert (np.size(l) == 1) 84 | l_nid = obs[l[0]] 85 | l_label = labels[obs[l[0]]] 86 | 87 | if np.size(r) > 1: 88 | r_nid = uuid.uuid4() 89 | r_label = 'None' 90 | else: 91 | assert (np.size(r) == 1) 92 | r_nid = obs[r[0]] 93 | r_label = labels[obs[r[0]]] 94 | 95 | # print(obs) 96 | l_obs = np.array([obs[i] for i in l]) 97 | # print(l_obs) 98 | r_obs = np.array([obs[i] for i in r]) 99 | # print(r_obs) 100 | frontier.append((l_nid, nid, l_label, mat[l, :][:, l], l_obs)) 101 | frontier.append((r_nid, nid, r_label, mat[r, :][:, r], r_obs)) 102 | # print(num_done) 103 | 104 | now = datetime.datetime.now() 105 | ts = "{:04d}-{:02d}-{:02d}-{:02d}-{:02d}-{:02d}".format(now.year, now.month, 106 | now.day, now.hour, 107 | now.minute, 108 | now.second) 109 | out_dir = os.path.basename(out_file) 110 | if not os.path.exists(out_dir): 111 | os.makedirs(out_dir) 112 | 113 | with open(out_file, 'w') as fout: 114 | fout.write(output) 115 | 116 | 117 | def run_sparsest_cut(sims, labels ): 118 | 119 | # sims = np.load(sim_file) 120 | from scipy.spatial.distance import cdist 121 | # sims = cdist(transformedPointList,transformedPointList) 122 | # labels = np.array([pidToGtCluster[i] for i in range(len(pidToGtCluster))]) 123 | 124 | # (Node id, parent id, label, mat, objs) 125 | output = '' 126 | frontier = [(uuid.uuid4(), 'None', 'None', sims, np.arange(labels.shape[0]))] 127 | num_done = 0 128 | while frontier: 129 | # print("Splits on frontier: {}. 
Completed {}".format(len(frontier), num_done)) 130 | nid, pid, label, mat, obs = frontier.pop(0) 131 | output += '%s\t%s\t%s\n' % (nid, pid, label) 132 | if obs.shape[0] > 1: 133 | l, r = sparsest_cut(mat) 134 | # Sometimes, this sparsest cut will not split the nodes. If this is 135 | # the case, we need to manually split them. 136 | if np.size(l) == 0: 137 | raise Exception('bad case...') 138 | l = [0] 139 | r = list(range(1, len(obs))) 140 | if np.size(r) == 0: 141 | raise Exception('bad case...') 142 | r = [0] 143 | l = list(range(1, len(obs))) 144 | 145 | if np.size(l) > 1: 146 | l_nid = uuid.uuid4() 147 | l_label = 'None' 148 | else: 149 | assert (np.size(l) == 1) 150 | l_nid = obs[l[0]] 151 | l_label = labels[obs[l[0]]] 152 | 153 | if np.size(r) > 1: 154 | r_nid = uuid.uuid4() 155 | r_label = 'None' 156 | else: 157 | assert (np.size(r) == 1) 158 | r_nid = obs[r[0]] 159 | r_label = labels[obs[r[0]]] 160 | 161 | # print(obs) 162 | l_obs = np.array([obs[i] for i in l]) 163 | # print(l_obs) 164 | r_obs = np.array([obs[i] for i in r]) 165 | # print(r_obs) 166 | frontier.append((l_nid, nid, l_label, mat[l, :][:, l], l_obs)) 167 | frontier.append((r_nid, nid, r_label, mat[r, :][:, r], r_obs)) 168 | # print(num_done) 169 | 170 | return output 171 | 172 | 173 | if __name__ == '__main__': 174 | run(sys.argv[1], sys.argv[2], sys.argv[3]) 175 | -------------------------------------------------------------------------------- /src/utils/create_synth_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2019 University of Massachusetts Amherst. 3 | This file is part of "expLinkage" 4 | http://github.com/iesl/expLinkage 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
def rotate(point, theta, anchor=(0, 0)):
    """Rotate 2-D `point` by `theta` radians counter-clockwise around `anchor`."""
    point = point[0] - anchor[0], point[1] - anchor[1]
    point = math.cos(theta) * point[0] - math.sin(theta) * point[1], math.sin(theta) * point[0] + math.cos(theta) * point[1]
    point = point[0] + anchor[0], point[1] + anchor[1]
    return point

# Reads clusters from file=filename and creates pairwise features for these clusters and stores them in dataDir
def create_pairwise_data(filename, dataDir, squared=False):
    """
    Build a pairwise-feature dataset from clustered points.

    Writes two files into dataDir:
      gtClusters.tsv   -- "<pid> <cid>" per point (space-delimited)
      pairFeatures.csv -- "<pid1>,<pid2>,<|x1-x2| per dim...>,<same-cluster 0/1>"

    Args:
        filename: input file readable by read_clusters_synth
        dataDir: output directory (created if missing)
        squared: if True, square each per-dimension absolute difference
    """
    clusters = read_clusters_synth(filename)

    pointData = {}  # Maps each pointId to (point,cid) tuple
    pid = 0
    for cid in clusters:
        for point in clusters[cid]:
            pointData[pid] = (point, cid)
            pid += 1

    Path(dataDir).mkdir(exist_ok=True, parents=True)
    with open("{}/gtClusters.tsv".format(dataDir), "w") as f:
        csvWriter = csv.writer(f, delimiter=" ")
        for pid in pointData.keys():
            row = [pid, pointData[pid][1]]
            csvWriter.writerow(row)

    with open("{}/pairFeatures.csv".format(dataDir), "w") as f:
        csvWriter = csv.writer(f)
        for pid1, pid2 in itertools.combinations(pointData.keys(), 2):

            # Per-dimension absolute difference as the pairwise feature vector
            featureVec = [abs(x1 - x2) for x1, x2 in zip(pointData[pid1][0], pointData[pid2][0])]
            if squared:
                featureVec = [x ** 2 for x in featureVec]

            row = [pid1, pid2] + featureVec
            # Final column: 1 if the pair is within the same gold cluster, else 0
            if pointData[pid1][1] == pointData[pid2][1]:
                row.append(1)
            else:
                row.append(0)

            csvWriter.writerow(row)

# Reads clusters from file=filename and creates pairwise features for these clusters and stores them in dataDir
# This one is specially written for generating different datasets for spiral clusters
def create_pairwise_spiral(filename, dataDir, squared=False, theta=0., trimAt=None, pushMinValTo=None):
    """
    Spiral-specific variant of create_pairwise_data.

    Subsamples points [40:90:5] of each cluster (cluster 2 is dropped), rotates
    cluster 1 by `theta` around (16, 15), then writes a plot plus orig2D.txt,
    pidToPoint.txt, gtClusters.tsv and pairFeatures.csv into dataDir.

    Args:
        filename: input file readable by read_clusters_synth
        dataDir: output directory (created if missing)
        squared: if True, square each per-dimension absolute difference
        theta: rotation (radians) applied to cluster 1 only
        trimAt: if set, cap the first feature at this value (after a swap that
            moves the smaller coordinate first)
        pushMinValTo: if set, inflate the larger feature so the pair's feature
            sum reaches at least this value
    """
    clusters = read_clusters_synth(filename)

    pointData = {}  # Maps each pointId to (point,cid) tuple
    pointToCluster = {}  # Maps each point to its cluster
    pntCtr, pid = 0, 0
    for cid in clusters:
        if cid == 2: continue
        # np.random.shuffle(clusters[cid])
        for point in clusters[cid][40:90]:
            pntCtr += 1
            if pntCtr % 5 != 0: continue  # keep every 5th point only
            if cid == 1:
                newPoint = rotate(point, theta, (16, 15))
            else:
                newPoint = point

            pointData[pid] = (newPoint, cid)
            pointToCluster[newPoint] = cid
            pid += 1

    Path(dataDir).mkdir(exist_ok=True, parents=True)
    plot_clusters(pointToCluster, dataDir + "/origData_{:.2f}.png".format(theta))
    # Raw 2-D coordinates with gold cluster, one point per line
    with open(dataDir + "/orig2D.txt", "w") as writer:
        for point in pointToCluster:
            writer.write("{}\t{}\t{}\n".format(point[0], point[1], pointToCluster[point]))

    # pid -> 2-D coordinate mapping
    with open(dataDir + "/pidToPoint.txt", "w") as writer:
        for pid in pointData:
            point = pointData[pid]
            writer.write("{}\t{}\t{}\n".format(pid, point[0][0], point[0][1]))

    with open("{}/gtClusters.tsv".format(dataDir), "w") as f:
        csvWriter = csv.writer(f, delimiter=" ")
        for pid in pointData.keys():
            row = [pid, pointData[pid][1]]
            csvWriter.writerow(row)

    with open("{}/pairFeatures.csv".format(dataDir), "w") as f:
        csvWriter = csv.writer(f)
        for pid1, pid2 in itertools.combinations(pointData.keys(), 2):

            featureVec = [abs(x1 - x2) for x1, x2 in zip(pointData[pid1][0], pointData[pid2][0])]
            if squared:
                featureVec = [x ** 2 for x in featureVec]

            if trimAt is not None and featureVec[0] > trimAt:
                featureVec[0], featureVec[1] = featureVec[1], featureVec[0]
                featureVec[0] = min(trimAt, featureVec[0])

            if pushMinValTo is not None and featureVec[0] + featureVec[1] < pushMinValTo:
                if featureVec[0] < featureVec[1]:
                    featureVec[1] = pushMinValTo
                else:
                    featureVec[0] = pushMinValTo

            row = [pid1, pid2] + featureVec
            # Final column: 1 if the pair is within the same gold cluster, else 0
            if pointData[pid1][1] == pointData[pid2][1]:
                row.append(1)
            else:
                row.append(0)

            csvWriter.writerow(row)

if __name__ == "__main__":

    # Command to generate spiral dataset with some rotation, with 2 spiral where MST and allPairs differ significantly
    # python scripts / create_synth_dataset.py - -file =../ data / sprial.txt - -outDir =../ data / spiralSmallRotated

    parser = argparse.ArgumentParser(description='Create dataset with edges = |p1-p2| from points in Rd')

    parser.add_argument('--file', type=str, required=True, help='File containing points in Rd')
    parser.add_argument('--outDir', type=str, required=True, help='Directory for newly created dataset')
    parser.add_argument('--sq', action="store_true", default=False, help='Square each component of edge?')

    args = parser.parse_args()

    filename = args.file  # filename = "../data/sprial.txt"
    dataDir = args.outDir  # dataDir = "../data/spiral_pw_sqd"

    # for theta in np.arange(0,3.14,0.1):
    for theta in [0.8]:
        create_pairwise_spiral(filename=filename, dataDir=dataDir + "/1", squared=args.sq, theta=theta)
        canopy = read_canopy_data(dataDir)
        plot_clusters_w_edges(canopy=canopy, model=None, filename=dataDir + "/1/edgeData_{:.2f}.png".format(theta))

    # clusters = readClusters_synth(filename)
    # points = {}
    # for cid in clusters:
    #     for point in clusters[cid]:
    #         points[point] = cid
    # plotClusters(points, dataDir+"/1/origData.png")

    # canopy = readCanopyData(dataDir)
    # plotClustersEdges(canopy=canopy, model=None, filename=dataDir+"/1/edgeData")

    # dataDir = "../data/spiral_pw/1"
    # create_pairwise_spiral(filename, dataDir, False)

    # dataDir = "../data/spiral_pw_sqd/1"
    # create_pairwise_spiral(filename, dataDir, True)

    # dataDir = "../data/spiral_pw_sqd_trimmed/1"
    # create_pairwise_spiral(filename, dataDir, True, 20)

    # dataDir = "../data/spiral_pw_sqd_trimmed_larger/1"
    # create_pairwise_spiral(filename, dataDir, True, 20,8)
"""
Copyright (C) 2019 University of Massachusetts Amherst.
This file is part of "expLinkage"
http://github.com/iesl/expLinkage
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import torch
from torch.autograd import Variable
import numpy as np


class MahalanobisDist(torch.nn.Module):
    """Learnable Mahalanobis-style distance.

    A single bias-free linear layer W maps points to an embedding space; the
    distance between two points is ||W p1 - W p2||_2, i.e. a Mahalanobis
    distance with M = W^T W.
    """

    def __init__(self, config):
        super(MahalanobisDist, self).__init__()
        self.config = config
        self.inputDim = config.inputDim
        # Square transform: embedding dimension equals the input dimension.
        self.outputDim = self.inputDim

        self.seqModel = torch.nn.Sequential(
            torch.nn.Linear(self.inputDim, self.outputDim, bias=False)
        )

        if config.idenInit:  # Start at the identity => plain Euclidean distance.
            self.seqModel[0].weight.requires_grad = False
            self.seqModel[0].weight.data = torch.eye(self.config.inputDim)
            self.seqModel[0].weight.requires_grad = True

    def _tensorCls(self):
        # Choose the tensor constructor for the configured device.
        return torch.cuda.FloatTensor if self.config.useGPU else torch.Tensor

    def __str__(self):
        return "\n".join([
            "-----------------Mahalanobis Distance Learner Parameters-----------------------------",
            "inputDim::\t" + str(self.inputDim),
            "Layers::" + str(self.seqModel),
            "Parameters::" + str(list(self.parameters())),
            "-------------------------------------------------------------------",
        ])

    def getWeightStr(self):
        # Human-readable dump of the learned transform.
        return "Weight::{}".format(self.seqModel[0].weight)

    def transformPoints(self, pointList):
        """Apply the learned linear map to a list of points; returns a numpy array."""
        embedded = self.seqModel(self._tensorCls()(pointList))
        if self.config.useGPU:
            embedded = embedded.cpu()
        return embedded.data.numpy()

    def pairForward(self, pairFeature):
        """Distance for one pre-computed pair feature (difference of two vectors)."""
        if self.config.useGPU:
            feat = Variable(torch.cuda.FloatTensor(pairFeature))  # take difference of two vectors to send as input
        else:
            feat = Variable(torch.Tensor(pairFeature))  # take difference of two vectors to send as input
        return torch.norm(self.seqModel(feat), p=2).view(1)

    def pairBatchForward(self, pairFeatureList):
        """Distances for a batch of pre-computed pair features; shape (n, 1)."""
        numPairs = len(pairFeatureList)
        if self.config.useGPU:
            feats = Variable(torch.cuda.FloatTensor(pairFeatureList))  # take difference of two vectors to send as input
        else:
            feats = Variable(torch.Tensor(pairFeatureList))  # take difference of two vectors to send as input

        predictions = torch.norm(self.seqModel(feats), dim=1, p=2).view(numPairs, 1)
        assert predictions.shape == torch.Size([numPairs, 1])
        return predictions

    def forward(self, point1, point2):
        """Differentiable distance ||W p1 - W p2||_2 between two points."""
        makeTensor = self._tensorCls()
        embed1 = self.seqModel(makeTensor(point1))
        embed2 = self.seqModel(makeTensor(point2))
        return torch.norm(embed1 - embed2, p=2)

    # This function does not return a pytorch Variable.
    # Just the Mahalanobis distance between point1 and point2.
    def forwardPlain(self, point1, point2):
        dist = self.forward(point1, point2)
        if self.config.useGPU:
            dist = dist.cpu()
        return dist.data.numpy()

    # Takes list of points and returns an adjacency matrix for it of size n x n
    def batchForwardWithin(self, points):
        numPoints = len(points)
        embedded = self.seqModel(self._tensorCls()(points))
        # Broadcast (n,1,d) against (1,n,d) so entry (i,j) is ||e_i - e_j||_2.
        asRows = embedded.view(numPoints, 1, self.outputDim)
        asCols = embedded.view(1, numPoints, self.outputDim)
        return torch.norm(asRows - asCols, p=2, dim=2).view(numPoints, numPoints)

    # Takes 2 lists of points and returns an adjacency matrix of size n1 x n2
    def batchForwardAcross(self, pointList1, pointList2):
        numPoint1, numPoint2 = len(pointList1), len(pointList2)
        makeTensor = self._tensorCls()
        embedList1 = self.seqModel(makeTensor(pointList1)).view(numPoint1, 1, self.outputDim)
        embedList2 = self.seqModel(makeTensor(pointList2)).view(1, numPoint2, self.outputDim)
        # Broadcasting yields entry (i,j) = ||e1_i - e2_j||_2.
        return torch.norm(embedList1 - embedList2, p=2, dim=2).view(numPoint1, numPoint2)

    # Returns distance between corresponding points in list 1 and list 2
    def batchForwardOneToOne(self, pointList1, pointList2):
        assert (len(pointList1) == len(pointList2))
        numPoints = len(pointList1)
        makeTensor = self._tensorCls()
        embedList1 = self.seqModel(makeTensor(pointList1).view(numPoints, self.inputDim))
        embedList2 = self.seqModel(makeTensor(pointList2).view(numPoints, self.inputDim))
        return torch.norm(embedList1 - embedList2, p=2, dim=1).view(numPoints, 1)


class GenLinkMahalanobis(MahalanobisDist):
    """MahalanobisDist augmented with a learnable scalar `linkAlpha`, used by
    the generalized (exp) linkage objective."""

    def __init__(self, config):
        super(GenLinkMahalanobis, self).__init__(config)

        initAlpha = np.random.normal(self.config.alphaInitMu, self.config.alphaInitSigma, 1)[0]
        # NOTE(review): linkAlpha is a Variable, not an nn.Parameter, so it does
        # NOT appear in self.parameters(); presumably the trainer adds it to the
        # optimizer explicitly — confirm before refactoring.
        if self.config.useGPU:
            self.linkAlpha = Variable(torch.cuda.FloatTensor([initAlpha]), requires_grad=True)
        else:
            self.linkAlpha = Variable(torch.FloatTensor([initAlpha]), requires_grad=True)

    def __str__(self):
        return "\n".join([
            "-----------------General Linkage with Mahalanobis Distance Matrix: Parameters-----------------------------",
            "linkAlpha::\t" + str(self.linkAlpha),
            "inputDim::\t" + str(self.inputDim),
            "Layers::" + str(self.seqModel),
            "Parameters::" + str(list(self.parameters())),
            "-------------------------------------------------------------------\n",
        ])
"""
Copyright (C) 2019 University of Massachusetts Amherst.
This file is part of "expLinkage"
http://github.com/iesl/expLinkage
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import argparse, time, sys, os
from pathlib import Path
import torch

from models.mahalabonis import MahalanobisDist, GenLinkMahalanobis

from utils.Config import Config
from utils.basic_utils import create_logger
from eval.finalEval import run_final_eval

from trainer.VectDataTrainer import VectDataTrainer


def trainExpLinkOnly(trainer):
    """Train only the expLink (generalized linkage) alpha parameter on top of
    an already-trained Mahalanobis model.

    Copies the learned distance weights into a GenLinkMahalanobis model,
    freezes the model weights (trainModel=False) and optimizes only alpha
    (trainAlpha=True) with the "linkage_auto" objective. No-op when the
    original objective was already "linkage_auto" or when the model type is
    not "maha".
    """
    if trainer.config.trainObj == "linkage_auto":
        trainer.logger.info("Not training linkageAlpha separately because if trainObj is linakge_auto then it must be trained already...")
    elif trainer.config.modelType == "maha":

        assert isinstance(trainer.model, MahalanobisDist)

        # Copy the learned weights into a fresh GenLinkMahalanobis (adds linkAlpha).
        new_model = GenLinkMahalanobis(trainer.config)
        new_model.seqModel[0].weight.requires_grad = False
        new_model.seqModel[0].weight.data = trainer.model.seqModel[0].weight.data
        new_model.seqModel[0].weight.requires_grad = True
        trainer.model = new_model
        if trainer.config.useGPU:
            trainer.logger.info("Shifting model to cuda because GPUs are available!")
            trainer.model = trainer.model.cuda()

        trainer.config.trainAlpha = True
        trainer.config.trainModel = False
        trainer.resetOptimizer()

        if "linkage_auto" not in trainer.config.inferenceMethods:
            trainer.config.inferenceMethods += ["linkage_auto"]
        if "linkage_auto@t" not in trainer.config.inferenceMethods:
            trainer.config.inferenceMethods += ["linkage_auto@t"]

        # Back up artifacts of the original training run before they are overwritten.
        origCSVFile = "{}/origTraining/results.csv"
        fileCheck = Path(origCSVFile.format(trainer.config.resultDir))
        if not fileCheck.is_file():
            # BUGFIX: format the template with resultDir; the original printed the
            # literal "{}/origTraining/results.csv" placeholder string.
            print("File does not exist:{}".format(origCSVFile.format(trainer.config.resultDir)))
            command = "cd {} && mkdir -p origTraining && cp *.csv origTraining/ && cp *.png origTraining/".format(trainer.config.resultDir)
            os.system(command)

        trainer.config.trainObj = "linkage_auto"
        trainer.logger.info("Training alpha parameter of expLink ...\n\n\n")
        trainer.logger.info(trainer.model)

        t1 = time.time()
        success = trainer.train()
        if success is not None and (not success):
            # Alpha training failed: withdraw the linkage_auto inference methods.
            # BUGFIX: narrowed the bare `except:` to the ValueError list.remove raises.
            try:
                trainer.config.inferenceMethods.remove("linkage_auto@t")
                trainer.config.inferenceMethods.remove("linkage_auto")
            except ValueError:
                pass

        trainer.printModelWeights()
        trainer.logger.info("Training alpha parameter of expLink linkage ends...in time={:.3f}".format(time.time() - t1))
        trainer.logger.info("Saving model...")

        trainer.config.bestModel = os.path.join(trainer.config.resultDir, "model_alpha.torch")
        torch.save(trainer.model, trainer.config.bestModel)
        trainer.config.save_config(trainer.config.resultDir, "config_expLink.json")
        trainer.logger.info("Saved model...")

    else:
        trainer.logger.info("Not training linkageAlpha separately because if modelType is not Mahalanobis distance matrix... ")


def runMain(config):
    """Entry point: run train / trainExpLink / test per config.mode, then the
    final evaluation.

    NOTE(review): the trainExpLink/test branches read the module-global `args`
    (set in __main__) for newDirSuffix — works only when invoked as a script.
    """
    command = sys.argv
    start = time.time()
    assert isinstance(config, Config)
    if config.mode == "train":
        trainer = VectDataTrainer(config)
        trainer.printModelWeights()

        t1 = time.time()
        trainer.train()

        trainer.logger.info("Training ends...in time={:.3f}".format(time.time() - t1))
        trainer.printModelWeights()
        trainer.logger.info("Saving model...")

        trainer.config.bestModel = os.path.join(trainer.config.resultDir, "model.torch")
        torch.save(trainer.model, trainer.config.bestModel)
        trainer.config.save_config(trainer.config.resultDir)
        trainer.logger.info("Saved model...")

        ################### Train alpha parameter for softLink ##########################
        if config.trainExpLink:
            trainExpLinkOnly(trainer)
        #################################################################################

    elif config.mode == "trainExpLink":
        trainer = VectDataTrainer(config)

        # Load model and reset optimizer to have parameters of the loaded model
        trainer.loadModel()

        # Update output directory
        trainer.config.resultDir = trainer.config.resultDir + args.newDirSuffix
        Path(trainer.config.resultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present

        # Update logger object
        trainer.logger = create_logger(config=config, logFile="logFile_trainExpLink.txt", currLogger=trainer.logger)

        trainer.logger.info(trainer)
        trainer.logger.info(command)
        trainExpLinkOnly(trainer)

    elif config.mode == "test":
        trainer = VectDataTrainer(config)

        # Load model and reset optimizer to have parameters of the loaded model
        trainer.loadModel()

        # Update output directory
        trainer.config.resultDir = trainer.config.resultDir + args.newDirSuffix
        Path(trainer.config.resultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present

        # Update logger object
        trainer.logger = create_logger(config=config, logFile="logFile_retest.txt", currLogger=trainer.logger)

        trainer.logger.info(command)
        trainer.logger.info(trainer)

    else:
        # BUGFIX: message now lists all supported modes (trainExpLink was missing),
        # matching the equivalent message in train_pair_feat.py.
        raise Exception("Invalid mode = {}. Choose one from: test, train or trainExpLink".format(config.mode))

    run_final_eval(trainer)
    # trainer.performFinalEvaluation()
    trainer.logger.info("\n\n\n\n")

    trainer.logger.info(trainer)
    trainer.logger.info(command)
    end = time.time()
    trainer.logger.info(" Total time taken = {:.4f} = {:.4f} min = {:.4f} hours".format(end - start, (end - start)/60, (end - start)/3600))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Supervised clustering training for data in R^n')
    parser.add_argument('--config', type=str, help="Config file")

    ################################## OPTIONAL ARGUMENTS TO OVERWRITE CONFIG FILE ARGS ###################################
    # Every Config attribute is exposed as an optional CLI flag that overrides
    # the value read from the config file.
    temp_config = Config()
    for config_arg in temp_config.__dict__:
        def_val = temp_config.__getattribute__(config_arg)
        arg_type = type(def_val) if def_val is not None else str
        parser.add_argument('--{}'.format(config_arg), type=arg_type, default=None, help='If not specified then value from config file will be used')
    ########################################################################################################################

    args = parser.parse_args()

    assert args.config is not None
    config = Config(args.config)
    for config_arg in temp_config.__dict__:
        def_val = getattr(args, config_arg)
        if def_val is not None:
            old_val = config.__dict__[config_arg]
            config.__dict__.update({config_arg: def_val})
            new_val = config.__dict__[config_arg]
            print("Updating Config.{} from {} to {} using arg_val={}".format(config_arg, old_val, new_val, def_val))

    # Update result directory if there are any parameters passed through command line that are different from those in config file
    if args.resultDir is None:
        config.updateResultDir("auto")
    else:
        config.updateResultDir(args.resultDir)

    Path(config.resultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present
    config.useGPU = config.cuda and torch.cuda.is_available()
    config.updateRandomSeeds(config.seed)
    config.save_config(config.resultDir, "orig_config.json")
    runMain(config)
"""
Copyright (C) 2019 University of Massachusetts Amherst.
This file is part of "expLinkage"
http://github.com/iesl/expLinkage
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""


import argparse, time, sys, os
from pathlib import Path
import torch


from utils.Config import Config
from utils.basic_utils import create_logger
from eval.finalEval import run_final_eval

from models.linearClassifier import LinearClassifier
from trainer.PairFeatureTrainer import PairFeatureTrainer


def trainExpLinkOnly(trainer):
    # Train only the expLink (generalized linkage) alpha parameter on top of an
    # already-trained linear / averaged-linear pair-feature model: the learned
    # weights are copied into a fresh LinearClassifier, model training is
    # frozen (trainModel=False) and only alpha is optimized (trainAlpha=True)
    # with the "linkage_auto" objective.

    assert isinstance(trainer, PairFeatureTrainer)

    if trainer.config.trainObj == "linkage_auto":
        trainer.logger.info("Not training linkageAlpha separately because if trainObj is linakge_auto then it must be trained already...")
    elif (trainer.config.modelType == "avgLinear" or trainer.config.modelType == "linear"):

        if trainer.config.modelType == "avgLinear":
            # Copy the *averaged* weights into a plain LinearClassifier.
            newModel = LinearClassifier(trainer.config)
            newModel.seqModel[0].weight.data = trainer.model.avgWeights.weight.data
            # NOTE(review): bias presence is checked on seqModel but copied from
            # avgWeights — assumes both were constructed with the same bias
            # setting; confirm in LinearClassifier/AvgLinearClassifier.
            if trainer.model.seqModel[0].bias is not None:
                newModel.seqModel[0].bias.data = trainer.model.avgWeights.bias.data

            trainer.model = newModel
        elif trainer.config.modelType == "linear":
            # Copy the current weights into a fresh LinearClassifier.
            newModel = LinearClassifier(trainer.config)
            newModel.seqModel[0].weight.data = trainer.model.seqModel[0].weight.data
            if trainer.model.seqModel[0].bias is not None:
                newModel.seqModel[0].bias.data = trainer.model.seqModel[0].bias.data

            trainer.model = newModel
        else:
            raise Exception("Invalid modelType..{}".format(trainer.config.modelType))

        if trainer.config.useGPU:
            trainer.logger.info("Shifting model to cuda because GPUs are available!")
            trainer.model = trainer.model.cuda()

        # Freeze the model; optimize only the linkage alpha.
        trainer.config.trainAlpha = True
        trainer.config.trainModel = False
        trainer.resetOptimizer()

        # Make sure the linkage_auto inference methods are evaluated.
        if "linkage_auto" not in trainer.config.inferenceMethods:
            trainer.config.inferenceMethods += ["linkage_auto"]
        if "linkage_auto@t" not in trainer.config.inferenceMethods:
            trainer.config.inferenceMethods += ["linkage_auto@t"]

        # Back up artifacts of the original training run before overwriting.
        # NOTE(review): the print below emits the unformatted template string.
        origCSVFile = "{}/origTraining/results.csv"
        fileCheck = Path(origCSVFile.format(trainer.config.resultDir))
        if not fileCheck.is_file():
            print("File does not exist:{}".format(origCSVFile))
            command = "cd {} && mkdir -p origTraining && cp *.csv origTraining/ && cp *.png origTraining/".format(trainer.config.resultDir)
            os.system(command)

        trainer.config.trainObj = "linkage_auto"
        trainer.logger.info("Training alpha parameter of expLink ...\n\n\n")
        trainer.logger.info(trainer.model)

        trainT1 = time.time()
        success = trainer.train()
        if success is not None and (not success):
            # Alpha training failed: withdraw the linkage_auto inference methods.
            try:
                trainer.config.inferenceMethods.remove("linkage_auto@t")
                trainer.config.inferenceMethods.remove("linkage_auto")
            except:
                pass

        trainer.printModelWeights()

        trainer.config.bestModel = os.path.join(trainer.config.resultDir, "model_alpha.torch")
        torch.save(trainer.model, trainer.config.bestModel )
        trainer.config.save_config(trainer.config.resultDir, "config_expLink.json")

        trainT2 = time.time()
        trainer.logger.info("Training alpha parameter of expLink linkage ends in time={:.3f} = {:.3f} min = {:.3f} hr \n\n\n".format(trainT2 - trainT1,(trainT2 - trainT1)/60, (trainT2 - trainT1)/3600))
    else:
        trainer.logger.info("Not training linkageAlpha separately because if modelType is not linear or avgLinear... ")


def runMain(config):
    # Entry point: run train / trainExpLink / test per config.mode, then the
    # final evaluation.
    # NOTE(review): the trainExpLink/test branches read the module-global
    # `args` (set in __main__) for newDirSuffix — script invocation only.
    assert isinstance(config,Config)

    command = sys.argv
    start = time.time()

    if config.mode == "train":
        trainer = PairFeatureTrainer(config)
        trainer.logger.info(command)

        trainer.logger.info("Inital Weights of the model...")
        trainer.printModelWeights()


        trainT1 = time.time()
        trainer.train()
        trainT2 = time.time()
        trainer.logger.info("Training ends in time={:.3f} = {:.3f} min = {:.3f} hr Saving model".format(trainT2 - trainT1,(trainT2 - trainT1)/60,(trainT2 - trainT1)/3600))

        trainer.logger.info("Weights that the model converged to...")
        trainer.printModelWeights()

        trainer.config.bestModel = os.path.join(trainer.config.resultDir, "model.torch")
        torch.save(trainer.model, trainer.config.bestModel )
        trainer.config.save_config(trainer.config.resultDir)
        trainer.logger.info("Saved model...")

        # Optionally train the expLink alpha parameter on top of the model.
        if config.trainExpLink:
            trainExpLinkOnly(trainer)

    elif config.mode == "trainExpLink":
        trainer = PairFeatureTrainer(config)
        trainer.logger.info(command)

        # Load model and reset optimizer to have parameters of the loaded model
        trainer.loadModel()

        # Update output directory
        trainer.config.resultDir = trainer.config.resultDir + args.newDirSuffix
        Path(trainer.config.resultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present

        # Update logger object
        trainer.logger = create_logger(config=config, logFile="logFile_trainExpLink.txt", currLogger=trainer.logger)

        trainer.logger.info(trainer)
        trainExpLinkOnly(trainer)

    elif config.mode == "test":
        trainer = PairFeatureTrainer(config)
        trainer.logger.info(command)

        # Load model and reset optimizer to have parameters of the loaded model
        trainer.loadModel()

        # Update output directory
        trainer.config.resultDir = trainer.config.resultDir + args.newDirSuffix
        Path(trainer.config.resultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present

        # Update logger object
        trainer.logger = create_logger(config=config, logFile="logFile_retest.txt", currLogger=trainer.logger)

    else:
        raise Exception("Invalid mode = {}. Choose one from: test, train or trainExpLink".format(config.mode))


    t1 = time.time()
    run_final_eval(trainer)
    t2 = time.time()
    trainer.logger.info(" Total time taken for final evaluation = {:.4f} = {:.4f} min = {:.4f} hours".format(t2 - t1, (t2 - t1)/60, (t2 - t1)/3600))

    trainer.logger.info(trainer)
    trainer.logger.info(command)
    end = time.time()
    trainer.logger.info(" Total time taken = {:.4f} = {:.4f} min = {:.4f} hours".format(end - start, (end - start)/60, (end - start)/3600))


if __name__ == '__main__':
    parser = argparse.ArgumentParser( description='Supervised clustering training with features given on every pair of points')

    temp_config = Config()
    parser.add_argument('--config', type=str, help="Config file")
    ################################## OPTIONAL ARGUMENTS TO OVERWRITE CONFIG FILE ARGS ###################################
    # Every Config attribute is exposed as an optional CLI flag that overrides
    # the value read from the config file.
    for config_arg in temp_config.__dict__:
        def_val = temp_config.__getattribute__(config_arg)
        arg_type = type(def_val) if def_val is not None else str
        parser.add_argument('--{}'.format(config_arg), type=arg_type, default=None, help='If not specified then value from config file will be used')
    ########################################################################################################################

    args = parser.parse_args()

    assert args.config is not None
    config = Config(args.config)
    for config_arg in temp_config.__dict__:
        def_val = getattr(args, config_arg)
        if def_val is not None:

            old_val = config.__dict__[config_arg]
            config.__dict__.update({config_arg:def_val})
            new_val =config.__dict__[config_arg]
            print("Updating Config.{} from {} to {} using arg_val={}".format(config_arg, old_val, new_val, def_val))

    # Update result directory if there are any parameters passed through command line that are different from those in config file
    if args.resultDir is None:
        config.updateResultDir("auto")
    else:
        config.updateResultDir(args.resultDir)

    Path(config.resultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present
    config.useGPU = config.cuda and torch.cuda.is_available()
    config.updateRandomSeeds(config.seed)
    config.save_config(config.resultDir, "orig_config.json")

    runMain(config)
"""
Copyright (C) 2019 University of Massachusetts Amherst.
This file is part of "expLinkage"
http://github.com/iesl/expLinkage
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""


import numpy as np, argparse
import torch
from pathlib import Path

from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

from eval.evalPairFeat import get_conn_comp_pair_feat
from utils.Config import Config
from utils.plotting import plot_clusters_w_edges, plot_clusters
from eval.threshold import choose_threshold

from models.linearClassifier import LinearClassifier
from models.templateClassifier import Classifier
from PairFeatureTrainer import PairFeatureTrainer


def getBestClassifier(modelType, seed, X, Y):
    """Fit 10 classifiers of the requested type and return the one with the
    highest training accuracy.

    Args:
        modelType: one of "SVMLinear", "SVMRbf", "Perceptron",
            "AvgPerceptron", "MST" (fixed weights learnt by the MST objective).
        seed: seed for np.random (and the MST perceptron's random_state).
        X, Y: training pair features and 0/1 labels.

    Returns:
        The fitted sklearn classifier with the best training accuracy.

    Raises:
        Exception: if modelType is not one of the names above.
    """
    classifiers = {}
    np.random.seed(seed)
    for i in range(10):
        if modelType == "SVMLinear":
            clf = SGDClassifier(loss="hinge", penalty="l2", tol=1e-9, alpha=0.01, max_iter=1000)  # Linear SVM
        elif modelType == "SVMRbf":
            clf = SVC(gamma='auto', tol=1e-9, )
        elif modelType == "Perceptron":
            clf = SGDClassifier(loss="perceptron", penalty="l2", tol=1e-9, alpha=0.01, max_iter=1000)
        elif modelType == "AvgPerceptron":
            clf = SGDClassifier(loss="perceptron", penalty="l2", tol=1e-9, alpha=0.01, max_iter=1000, average=True)
        elif modelType == "MST":
            # BUGFIX: use the `seed` parameter instead of the module-global
            # `config` (the function already receives the seed explicitly).
            clf = Perceptron(random_state=seed, penalty="l2", max_iter=1000, alpha=0.01, tol=1e-5, warm_start=True, shuffle=True)
            clf.fit(X, Y)  # Doing this just to get the other class variables initialized
            # Optimal parameters as learnt by MST objective
            clf.coef_ = np.array([[-0.092749, -0.076006]])
            clf.intercept_ = np.array([0.3871])
        else:
            raise Exception("Invalid Model:{}".format(modelType))

        # Initializing parameters requires warm_start=True. If shuffle is False
        # we get the same results for every random_state, but with shuffle=True
        # parameters differ because data is shuffled at every iteration.
        # clf = Perceptron(random_state=args.seed, penalty="l2", max_iter=1000, alpha=0.01, tol=1e-5, warm_start=True,shuffle=True)
        # clf.coef_ = np.array([[1,1]])
        # clf.intercept_ = np.array([0])
        # clf.fit(X, Y)
        # Optimal parameters as learnt by MST objective
        # clf.coef_ = np.array([[-0.092749, -0.076006]])
        # clf.intercept_ = np.array([0.3871])

        # MST weights are fixed above; every other model is (re-)fit here.
        if modelType != "MST":
            clf.fit(X, Y)
        score = clf.score(X, Y)
        print("Accuracy on train data:{:.3f}".format(score))
        classifiers[i] = (clf, score)

    # Pick the restart with the best training accuracy.
    bestClf = None
    bestScore = 0
    for i in classifiers.keys():
        if bestClf is None or bestScore < classifiers[i][1]:
            bestClf = classifiers[i][0]
            bestScore = classifiers[i][1]

    print("Model with best Accuracy on train data:{:.3f}".format(bestScore))
    return bestClf


if __name__ == "__main__":

    parser = argparse.ArgumentParser("Run Scipy perceptron on pairwise data(synthetic points in R2)")
    parser.add_argument('--config', type=str, help="Config file")

    temp_config = Config()
    ################################## OPTIONAL ARGUMENTS TO OVERWRITE CONFIG FILE ARGS ###################################
    # Every Config attribute is exposed as an optional CLI flag that overrides
    # the value read from the config file.
    for config_arg in temp_config.__dict__:
        def_val = temp_config.__getattribute__(config_arg)
        arg_type = type(def_val) if def_val is not None else str
        parser.add_argument('--{}'.format(config_arg), type=arg_type, default=None, help='If not specified then value from config file will be used')
    ########################################################################################################################

    args = parser.parse_args()

    assert args.config is not None
    config = Config(args.config)
    for config_arg in temp_config.__dict__:
        def_val = getattr(args, config_arg)
        if def_val is not None:

            old_val = config.__dict__[config_arg]
            config.__dict__.update({config_arg: def_val})
            new_val = config.__dict__[config_arg]
            print("Updating Config.{} from {} to {} using arg_val={}".format(config_arg, old_val, new_val, def_val))

    # Update result directory if there are any parameters passed through command line that are different from those in config file
    if args.resultDir is None:
        config.updateResultDir("auto")
    else:
        config.updateResultDir(args.resultDir)

    Path(config.resultDir).mkdir(parents=True, exist_ok=True)  # Create resultDir directory if not already present
    config.useGPU = config.cuda and torch.cuda.is_available()
    config.updateRandomSeeds(config.seed)
    config.save_config(config.resultDir, "orig_config.json")

    trainer = PairFeatureTrainer(config)
    resultDir = trainer.config.resultDir

    # Assemble the training matrix: one row of pair features per (p1, p2),
    # label 1 iff the two points share a ground-truth cluster.
    X, Y = [], []
    for canopyId in trainer.trainCanopies:
        canopy = trainer.trainCanopies[canopyId]
        for (p1, p2) in canopy["pairFeatures"]:
            X.append(canopy["pairFeatures"][(p1, p2)])
            label = 1 if canopy["pidToCluster"][p1] == canopy["pidToCluster"][p2] else 0
            Y.append(label)

    X, Y = np.array(X), np.array(Y)
    clf = getBestClassifier(config.model, config.seed, X, Y)

    # For the spiral dataset, map point ids back to their R^2 coordinates.
    if "spiral" in trainer.config.dataDir:
        pidToPoint = {}
        with open("{}/1/pidToPoint.txt".format(trainer.config.dataDir)) as f:
            for line in f:
                lineV = line.strip().split()
                pid, x1, x2 = int(lineV[0]), float(lineV[1]), float(lineV[2])
                pidToPoint[pid] = (x1, x2)

    if hasattr(clf, "coef_"):
        # Linear classifier: copy (m1, m2, b) into the torch LinearClassifier
        # so threshold selection and plotting use the same decision boundary.
        b = clf.intercept_[0]
        m1, m2 = clf.coef_[0][0], clf.coef_[0][1]

        assert isinstance(trainer.model, LinearClassifier)
        trainer.model.seqModel[0].weight.data = torch.cuda.FloatTensor([[m1, m2]]) if config.useGPU else torch.FloatTensor([[m1, m2]])
        trainer.model.seqModel[0].bias.data = torch.cuda.FloatTensor([b]) if config.useGPU else torch.FloatTensor([b])
        optThresh = choose_threshold(trainer, "connComp", "1", trainer.trainCanopies)

        model = (m1, m2, b)
        optModel = (m1, m2, b - optThresh)
        plot_clusters_w_edges(trainer.trainCanopies, model, "{}/boundary_{}.pdf".format(resultDir, config.seed))
        plot_clusters_w_edges(trainer.trainCanopies, optModel, "{}/boundaryOpt_{}.pdf".format(resultDir, config.seed))
        # plotClustersEdges(trainer.trainCanopies, optModel, "{}/boundaryOptWithBase_{}.pdf".format(resultDir, config.seed), baseModel=model)
        plot_clusters_w_edges(trainer.trainCanopies, model, "{}/boundaryOptWithBase_{}.pdf".format(resultDir, config.seed), baseModel=optModel)
    elif isinstance(clf, SVC):
        trainer.model = Classifier(config)
        trainer.model.clf = clf
        optThresh = choose_threshold(trainer, "connComp", "1", trainer.trainCanopies)
        print("Opt threshold = {}".format(optThresh))

        plot_clusters_w_edges(trainer.trainCanopies, clf, "{}/boundary_{}.png".format(resultDir, config.seed))
    else:
        # BUGFIX: format the message; the original passed clf as a second
        # exception argument, so the "{}" was never filled in.
        raise Exception("Invalid model:{}".format(clf))

    # NOTE(review): pidToPoint is only defined when "spiral" is in dataDir;
    # for any other dataset the loop below raises NameError — confirm intent.
    for canopyId in trainer.trainCanopies:
        canopy = trainer.trainCanopies[canopyId]
        # Predicted clustering at the tuned threshold.
        pidToPredCluster = get_conn_comp_pair_feat(model=trainer.model, pairFeatures=canopy["pairFeatures"],
                                                   pidToCluster=canopy["pidToCluster"], threshold=optThresh)
        pointToPredCluster = {}
        pointToTrueCluster = {}
        for pid in pidToPredCluster:
            point = pidToPoint[pid]
            pointToPredCluster[point] = pidToPredCluster[pid]
            pointToTrueCluster[point] = canopy["pidToCluster"][pid]

        plot_clusters(pointToCluster=pointToPredCluster, filename=trainer.config.resultDir + "/predClusterOptThresh_{}.pdf".format(config.seed))
        plot_clusters(pointToCluster=pointToTrueCluster, filename=trainer.config.resultDir + "/trueCluster.pdf")

        # Predicted clustering at the learnt (threshold=0) boundary.
        pidToPredCluster = get_conn_comp_pair_feat(model=trainer.model, pairFeatures=canopy["pairFeatures"],
                                                   pidToCluster=canopy["pidToCluster"], threshold=0)
        pointToPredCluster = {}
        pointToTrueCluster = {}
        for pid in pidToPredCluster:
            point = pidToPoint[pid]
            pointToPredCluster[point] = pidToPredCluster[pid]
            pointToTrueCluster[point] = canopy["pidToCluster"][pid]
        # Removed a no-op .format(config.seed) on a placeholder-free filename;
        # NOTE(review): a per-seed suffix (as in predClusterOptThresh_{}) may
        # have been intended — confirm before adding one.
        plot_clusters(pointToCluster=pointToPredCluster, filename=trainer.config.resultDir + "/predClusterLearnt.pdf")
14 | """ 15 | 16 | 17 | from eval.evalPairFeat import eval_model_pair_feat_per_canopy 18 | from models.mahalabonis import MahalanobisDist 19 | from models.linearClassifier import AvgLinearClassifier, LinearClassifier 20 | from eval.threshold import choose_threshold 21 | from utils.plotting import write_scores_comb,write_scores_separate, plot_scores_per_canopy, plot_scores 22 | 23 | # Perform final evaluation of model 24 | def run_final_eval(trainer): 25 | 26 | # assert isinstance(trainer, VectDataTrainer) or isinstance(trainer, PairFeatureTrainer) 27 | 28 | trainer.logger.info("Choosing best threshold for evaluation in the end...") 29 | if isinstance(trainer.model, AvgLinearClassifier): 30 | trainer.logger.info("Loading average weights") 31 | trainer.model.seqModel[0].weight.data = trainer.model.avgWeights.weight.data 32 | if trainer.model.seqModel[0].bias is not None: 33 | trainer.model.seqModel[0].bias.data = trainer.model.avgWeights.bias.data 34 | 35 | trainer.logger.info("Weights being used for performing evaluation...") 36 | trainer.printModelWeights() 37 | 38 | trainer.config.threshold = None # Uncomment this line if you want to chooseThreshold 39 | ############################### Choose threshold based on dev canopy###################################### 40 | threshDict = {} 41 | for method in trainer.config.inferenceMethods: 42 | threshDict[method] = choose_threshold(trainer, infMethod=method, epoch="END_BestDev") 43 | 44 | trainer.logger.info("Using dev thresholdVals:{}".format(threshDict)) 45 | eval_all_data(trainer, threshDict, "/BestDevThresh") 46 | ########################################################################################################### 47 | 48 | ############################### Choose threshold based on test canopy###################################### 49 | if len(trainer.testCanopies) > 0 and trainer.config.evalOnTestThresh: 50 | threshDict = {} 51 | for method in trainer.config.inferenceMethods: 52 | threshDict[method] = 
choose_threshold(trainer, infMethod=method, epoch="END_BestTest", canopies=trainer.testCanopies) 53 | 54 | trainer.logger.info("Using test thresholdVals:{}".format(threshDict)) 55 | eval_all_data(trainer, threshDict, "/BestTestThresh") 56 | ########################################################################################################### 57 | 58 | ##################################### Choose threshold based on train canopy############################## 59 | if trainer.config.evalOnTrainThresh: 60 | threshDict = {} 61 | for method in trainer.config.inferenceMethods: 62 | threshDict[method] = choose_threshold(trainer, infMethod=method, epoch="END_BestTrain", canopies=trainer.trainCanopies) 63 | 64 | trainer.logger.info("Using train thresholdVals:{}".format(threshDict)) 65 | eval_all_data(trainer, threshDict, "/BestTrainThresh") 66 | ########################################################################################################### 67 | pass 68 | 69 | def eval_all_data(trainer, threshDict, relResultDir = None): 70 | allScores = {"train": {}, "test": {}, "dev": {}} 71 | 72 | # Not using config.infertenceMethods as sometimes we want to just evaluate on just 1 inference methods during training 73 | infMethods = [method for method in threshDict.keys()] 74 | 75 | allScores["test"][0] = trainer.evalFunc(config=trainer.config,model=trainer.model, canopies=trainer.testCanopies, threshDict=threshDict, 76 | inferenceMethods=infMethods, metricsForEval=trainer.config.metricsForEval) 77 | 78 | allScores["dev"][0] = trainer.evalFunc(config=trainer.config,model=trainer.model, canopies=trainer.devCanopies, threshDict=threshDict, 79 | inferenceMethods=infMethods, metricsForEval=trainer.config.metricsForEval) 80 | 81 | allScores["train"][0] = trainer.evalFunc(config=trainer.config,model=trainer.model, canopies=trainer.trainCanopies, threshDict=threshDict, 82 | inferenceMethods=infMethods, metricsForEval=trainer.config.metricsForEval) 83 | 84 | if relResultDir is not 
None: 85 | if trainer.config.makeScorePlots: 86 | plot_scores(allLosses={"train":{}, "test":{}, "dev":{}}, allScores=allScores, 87 | currResultDir=trainer.config.resultDir + relResultDir, xlabel="Threshold") 88 | 89 | write_scores_comb(allLosses={"train": {}, "test": {}, "dev": {}}, allScores=allScores, 90 | currResultDir=trainer.config.resultDir + relResultDir, xlabel="Threshold") 91 | 92 | write_scores_separate(allLosses={"train": {}, "test": {}, "dev": {}}, allScores=allScores, 93 | currResultDir=trainer.config.resultDir + relResultDir, xlabel="Threshold") 94 | 95 | return allScores["train"][0], allScores["test"][0], allScores["dev"][0] 96 | 97 | def run_final_eval_per_canopy(trainer): 98 | 99 | from trainer.PairFeatureTrainer import PairFeatureTrainer 100 | assert isinstance(trainer, PairFeatureTrainer) 101 | 102 | trainer.logger.info("Choosing optimal threshold and running model with average weights for that...".format()) 103 | 104 | if isinstance(trainer.model, AvgLinearClassifier): 105 | trainer.model.seqModel[0].weight.data = trainer.model.avgWeights.weight.data 106 | if trainer.model.seqModel[0].bias is not None: 107 | trainer.model.seqModel[0].bias.data = trainer.model.avgWeights.bias.data 108 | 109 | if isinstance(trainer.model, MahalanobisDist) or isinstance(trainer.model, LinearClassifier): 110 | trainer.logger.info("Weights being used for performing evalutaion...") 111 | trainer.logger.info("Weight::{}".format(trainer.model.seqModel[0].weight)) 112 | trainer.logger.info("Bias::{}".format(trainer.model.seqModel[0].bias)) 113 | 114 | trainer.logger.info("Choosing best threshold for evaluation in the end...") 115 | 116 | trainer.config.threshold = None # Uncomment this line if you want to chooseThreshold 117 | 118 | ############################### Choose threshold based on dev canopy###################################### 119 | threshDict = {} 120 | for method in trainer.config.inferenceMethods: 121 | threshDict[method] = choose_threshold(trainer, 
infMethod=method, epoch="END_BestDev") 122 | 123 | trainer.logger.info("Using dev thresholdVals:{}".format(threshDict)) 124 | eval_all_data_per_canopy(trainer, threshDict, "/BestDevThresh") 125 | ########################################################################################################### 126 | 127 | 128 | ############################### Choose threshold based on test canopy###################################### 129 | if len(trainer.testCanopies) > 0: 130 | threshDict = {} 131 | for method in trainer.config.inferenceMethods: 132 | threshDict[method] = choose_threshold(trainer, infMethod=method, epoch="END_BestTest", 133 | canopies=trainer.testCanopies) 134 | 135 | trainer.logger.info("Using test thresholdVals:{}".format(threshDict)) 136 | eval_all_data_per_canopy(trainer, threshDict, "/BestTestThresh") 137 | ########################################################################################################### 138 | 139 | ####################################3# Choose threshold based on train canopy############################## 140 | threshDict = {} 141 | for method in trainer.config.inferenceMethods: 142 | threshDict[method] = choose_threshold(trainer, infMethod=method, epoch="END_BestTrain", 143 | canopies=trainer.trainCanopies) 144 | 145 | trainer.logger.info("Using train thresholdVals:{}".format(threshDict)) 146 | eval_all_data_per_canopy(trainer, threshDict, "/BestTrainThresh") 147 | ########################################################################################################### 148 | pass 149 | 150 | def eval_all_data_per_canopy(trainer, threshDict, relResultDir): 151 | allScores = {} 152 | 153 | # def eval_model_pair_feat_per_canopy(model, canopies, inferenceMethods, threshDict, logger, metricsForEval) 154 | 155 | allScores["test"] = eval_model_pair_feat_per_canopy(model=trainer.model, canopies=trainer.testCanopies, logger=trainer.logger, 156 | threshDict=threshDict, inferenceMethods=trainer.config.inferenceMethods, 
metricsForEval=trainer.config.metricsForEval) 157 | 158 | allScores["dev"] = eval_model_pair_feat_per_canopy(model=trainer.model, canopies=trainer.devCanopies, logger=trainer.logger, 159 | threshDict=threshDict, inferenceMethods=trainer.config.inferenceMethods, metricsForEval=trainer.config.metricsForEval) 160 | 161 | allScores["train"] = eval_model_pair_feat_per_canopy(model=trainer.model, canopies=trainer.trainCanopies, logger=trainer.logger, 162 | threshDict=threshDict, inferenceMethods=trainer.config.inferenceMethods, metricsForEval=trainer.config.metricsForEval) 163 | 164 | plot_scores_per_canopy(allScores=allScores, currResultDir=trainer.config.resultDir + relResultDir) 165 | -------------------------------------------------------------------------------- /src/utils/combineResults.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2019 University of Massachusetts Amherst. 3 | This file is part of "expLinkage" 4 | http://github.com/iesl/expLinkage 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | """ 15 | 16 | import csv,os,argparse,copy 17 | import numpy as np 18 | from pathlib import Path 19 | 20 | from utils.plotting import plotMetricsFromCSV 21 | from utils.basic_utils import get_filename_list 22 | from utils.Config import Config 23 | 24 | def combineResults(parameters, xlabel, currResultDir, template): 25 | """ 26 | Put together results from all files into 1 file 27 | :param parameters: Dictionary with key as parameter names and value as a list of parameter values which need to be combined 28 | :param xlabel: Varying dimension 29 | :param currResultDir: 30 | :param template: template for folder name where results are read from for combining 31 | :return: 32 | """ 33 | 34 | filenameList = get_filename_list(parameters, template) 35 | data = {} 36 | header = None 37 | numFiles = 0 38 | # Read data from all files 39 | for filenum,filename in enumerate(filenameList): 40 | data[filenum] = {} 41 | 42 | fileCheck = Path(filename) 43 | if not fileCheck.is_file(): 44 | print("pwd:{}".format(os.getcwd())) 45 | print("File does not exist:{}".format(filename)) 46 | continue 47 | 48 | numFiles +=1 49 | with open(filename, "r") as f: # Read data from this file into a dictionary 50 | csvReader = csv.DictReader(f) 51 | for row in csvReader: 52 | if header is None: header = list(row.keys()) # Get header names 53 | 54 | for col in header: # Convert all row values to float if they can be else assign 0 value(these columns must be empty) 55 | try: 56 | row[col] = float(row[col]) 57 | except ValueError: # ValueError because these col must be empty 58 | assert row[col] == "" 59 | row[col] = 0 60 | 61 | # xlabelValue = None 62 | # for col in header: # Find value of xDim for this row 63 | # if col == xlabel: 64 | # xlabelValue = row[col] 65 | # break 66 | xlabelValue = row[xlabel] if xlabel in row else None 67 | 68 | assert xlabelValue is not None 69 | data[filenum][xlabelValue] = {} 70 | for col in header: # Add data for all col in data dictionary as list of values 71 | 
if col == xlabel: continue 72 | data[filenum][xlabelValue][col] = row[col] 73 | 74 | assert len(data[filenum]) == 1 75 | 76 | # Compute best result for each file 77 | finalData = {} 78 | for filenum in data: 79 | if len(data[filenum].keys()) == 0: 80 | print("Ignoring file:{}\n".format(filenum)) 81 | bestxDimValue = None 82 | continue 83 | else: 84 | assert len(data[filenum]) == 1 85 | bestxDimValue = list(data[filenum].keys())[0] 86 | 87 | bestRow = {} 88 | for col in data[filenum][bestxDimValue]: 89 | bestRow[col] = data[filenum][bestxDimValue][col] 90 | finalData[filenum] = (bestxDimValue,copy.deepcopy(bestRow)) 91 | 92 | # Write csv file containing best results from all files 93 | with open(currResultDir + "/results.csv", "w") as f: 94 | csvWriter = csv.DictWriter(f, fieldnames=header+["FileNum"]) 95 | csvWriter.writeheader() 96 | 97 | for filenum in range(numFiles): 98 | if filenum in finalData: 99 | tempDict = copy.deepcopy(finalData[filenum][1]) 100 | tempDict[xlabel] = finalData[filenum][0] # Add xDim to dictionary when writing data 101 | tempDict["FileNum"] = filenum 102 | 103 | csvWriter.writerow(tempDict) 104 | else: 105 | pass 106 | # print("Filenum not included in best result, possibly because choosing a threshold failed for this file:{}".format(filenum)) 107 | 108 | print("\nIgnoring orginal standard deviations when computing avg of best results") 109 | print("File will have standard deviation of best mean scores\n") 110 | # Write csv file containing avg of best results from all files 111 | with open(currResultDir + "/avgOfBestResults.csv", "w") as f: 112 | csvWriter = csv.DictWriter(f, fieldnames=header) 113 | csvWriter.writeheader() 114 | 115 | avgData = {col:[] for col in header} 116 | 117 | for col in header: 118 | if col.endswith("_std"): # If commenting this, also comment computation of std deviation of best scores 119 | continue 120 | # print("Ignoring deviation of best scores, just reporting average of best scores, and average of their std 
deviations") 121 | 122 | for filenum in range(numFiles): 123 | if filenum in finalData: 124 | if col == xlabel: 125 | avgData[col] += [finalData[filenum][0]] 126 | else: 127 | avgData[col] += [finalData[filenum][1][col]] 128 | else: 129 | pass 130 | # print("Filenum not included in best result, possibly because choosing a threshold failed for this file:{}".format(filenum)) 131 | 132 | if col.endswith("_mean"): # Computing std deviation of best scores 133 | avgData[col[:-5]+"_std"] = np.std(avgData[col]) 134 | avgData[col] = np.mean(avgData[col]) 135 | 136 | for col in avgData: 137 | if isinstance(avgData[col], float): 138 | avgData[col] = "{:0.4f}".format(avgData[col]) 139 | 140 | csvWriter.writerow(avgData) 141 | 142 | def run_combineResults(baseResDir, outDirPrefix, xlabel, baseTemplate, relResultDir, parameters): 143 | """ 144 | 145 | :param baseResDir: 146 | :param outDirPrefix: Prefix to be used for directory where results will be stored 147 | :param xlabel: Dimension along which best rows have to be found. 
eg=Threshold, Epoch 148 | :param baseTemplate: Template structure for folder where results are stored 149 | :param relResultDir: Folder where result.csv file is present,(relative to result directory where training results are stored) 150 | :param parameters: Dictionary with key as parameter names and value as a list of parameter values which need to be combined 151 | :return: 152 | """ 153 | origWorkDir = os.getcwd() 154 | os.chdir(baseResDir) 155 | 156 | currResultDir = "{outDirPrefix}_xlabel={xlabel}/{base}".format(outDirPrefix=outDirPrefix, xlabel=xlabel, base=baseTemplate) 157 | currResultDir = currResultDir.format(**parameters) 158 | Path(currResultDir).mkdir(parents=True, exist_ok=True) # Create resultDir directory if not already present 159 | print("CurrResultDir:{}".format(currResultDir)) 160 | 161 | ############### Combine Results ############################ 162 | template = baseTemplate + "/{}/results.csv".format(relResultDir) 163 | combineResults(parameters, xlabel, currResultDir, template) 164 | 165 | ################ Plot Results ############################## 166 | # os.chdir(origWorkDir) 167 | # currResultDir = baseResDir + "/" + currResultDir 168 | # plotMetricsFromCSV(currResultDir=currResultDir, xlabel="FileNum") 169 | 170 | 171 | if __name__ == "__main__": 172 | 173 | 174 | parser = argparse.ArgumentParser(description='Combine results from different runs Ex: python -m scripts.combineResults --outDirPrefix=BestF1_AvgW --baseResDir=../results/c=NP_Coref --relResultDir=varyThresAvgWeights_f1 --xlabel=Threshold --trainObj=allWithin_allAcross --threshold=0.0 --margin=5 --modelType=avgLinear --trainFrac=0.6 --testFrac=0.3 --devFrac=0.1 --seed 1 2 3 4 5 6 7 8 9 10') 175 | 176 | # ################################## OPTIONAL ARGUMENTS TO OVERWRITE CONFIG FILE ARGS################################################### 177 | # temp_config = Config() 178 | # for config_arg in temp_config.__dict__: 179 | # if config_arg == "seed": continue 180 | # def_val = 
temp_config.__getattribute__(config_arg) 181 | # arg_type = type(def_val) if def_val is not None else str 182 | # parser.add_argument('--{}'.format(config_arg), type=arg_type, default=None, help='If not specified then value from config file will be used') 183 | # ######################################################################################################### 184 | 185 | parser.add_argument('--config', type=str,required=True, help='Config file') 186 | parser.add_argument('--seed', nargs='+',required=True, type=int, help="seed for random number generator") 187 | parser.add_argument('--xlabel', type=str,required=True, help='X-Label') 188 | parser.add_argument('--baseResDir', type=str, required=True,help='Directory where all result folders are stored') 189 | parser.add_argument('--suffix', type=str, default="", help="Suffix at end of each directory") 190 | parser.add_argument('--relResultDir', type=str,required=True, help='Name of folder where results.csv file is present(relative to folder where training results are stored') 191 | parser.add_argument('--outDirPrefix', type=str,required=True, help='Prefix to be used for directory where results will be stored') 192 | 193 | args = parser.parse_args() 194 | config = Config(args.config) 195 | 196 | parameters = {} 197 | parameters["d"] = config.dataDir.split("/")[-1] 198 | parameters["obj"] = config.trainObj 199 | parameters["s"] = args.seed 200 | xlabel = args.xlabel 201 | 202 | if args.suffix != "": 203 | parameters["suff"] = [args.suffix] 204 | baseTemplate = "obj={obj}_s={s}{suff}" 205 | else: 206 | baseTemplate = "obj={obj}_s={s}" 207 | 208 | run_combineResults(baseResDir=args.baseResDir, outDirPrefix=args.outDirPrefix, xlabel=args.xlabel, 209 | baseTemplate=baseTemplate, relResultDir=args.relResultDir, parameters=parameters) 210 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 
Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /env.yml: -------------------------------------------------------------------------------- 1 | name: base 2 | channels: 3 | - pytorch 4 | - lenskit 5 | - defaults 6 | dependencies: 7 | - _ipyw_jlab_nb_ext_conf=0.1.0=py36he11e457_0 8 | - _libgcc_mutex=0.1=main 9 | - _pytorch_select=0.2=gpu_0 10 | - _tflow_select=2.1.0=gpu 11 | - alabaster=0.7.10=py36h306e16b_0 12 | - anaconda=custom=py36hbbc8b67_0 13 | - anaconda-client=1.6.9=py36_0 14 | - anaconda-navigator=1.7.0=py36_0 15 | - anaconda-project=0.8.2=py36h44fb852_0 16 | - arrow-cpp=0.11.1=py36h5c3f529_1 17 | - asn1crypto=0.24.0=py36_0 18 | - astor=0.7.1=py36_0 19 | - astroid=1.6.1=py36_0 20 | - astropy=2.0.3=py36h14c3975_0 21 | - attrs=17.4.0=py36_0 22 | - babel=2.5.3=py36_0 23 | - backports=1.0=py36hfa02d7e_1 24 | - backports.shutil_get_terminal_size=1.0.0=py36hfea85ff_2 25 | - beautifulsoup4=4.6.0=py36h49b8c8c_1 26 | - bitarray=0.8.1=py36h14c3975_1 27 | - bkcharts=0.2=py36h735825a_0 28 | - blas=1.0=mkl 29 | - blaze=0.11.3=py36h4e06776_0 30 | - bokeh=0.12.13=py36h2f9c1c0_0 31 | - 
boto=2.48.0=py36h6e4cd66_1 32 | - bottleneck=1.2.1=py36haac1ea0_0 33 | - bzip2=1.0.6=h9a117a8_4 34 | - c-ares=1.15.0=h7b6447c_1 35 | - ca-certificates=2019.10.16=0 36 | - cairo=1.14.12=h8948797_3 37 | - certifi=2019.9.11=py36_0 38 | - chardet=3.0.4=py36h0f667ec_1 39 | - click=6.7=py36h5253387_0 40 | - cloudpickle=0.5.2=py36_1 41 | - clyent=1.2.2=py36h7e57e65_1 42 | - colorama=0.3.9=py36h489cec4_0 43 | - conda=4.7.12=py36_0 44 | - conda-build=3.4.1=py36_0 45 | - conda-env=2.6.0=h36134e3_1 46 | - conda-package-handling=1.6.0=py36h7b6447c_0 47 | - conda-verify=2.0.0=py36h98955d8_0 48 | - contextlib2=0.5.5=py36h6c84a62_0 49 | - cryptography=2.4.2=py36h1ba5d50_0 50 | - cudatoolkit=10.0.130=0 51 | - cudnn=7.6.0=cuda10.0_0 52 | - cupti=10.0.130=0 53 | - curl=7.63.0=hbc83047_1000 54 | - cycler=0.10.0=py36h93f1223_0 55 | - cython=0.27.3=py36h1860423_0 56 | - cytoolz=0.9.0=py36h14c3975_0 57 | - dask=0.16.1=py36_0 58 | - dask-core=0.16.1=py36_0 59 | - datashape=0.5.4=py36h3ad6b5c_0 60 | - dbus=1.13.2=h714fa37_1 61 | - decorator=4.2.1=py36_0 62 | - distributed=1.20.2=py36_0 63 | - docutils=0.14=py36hb0f60f5_0 64 | - entrypoints=0.2.3=py36h1aec115_2 65 | - et_xmlfile=1.0.1=py36hd6bccc3_0 66 | - expat=2.2.5=he0dffb1_0 67 | - fastcache=1.0.2=py36h14c3975_2 68 | - fastparquet=0.3.2=py36hdd07704_0 69 | - filelock=2.0.13=py36h646ffb5_0 70 | - flask-cors=3.0.3=py36h2d857d3_0 71 | - fontconfig=2.13.0=h9420a91_0 72 | - freetype=2.9.1=h8a8886c_1 73 | - fribidi=1.0.5=h7b6447c_0 74 | - get_terminal_size=1.0.0=haa9412d_0 75 | - gflags=2.2.2=he6710b0_0 76 | - glib=2.56.2=hd408876_0 77 | - glob2=0.6=py36he249c77_0 78 | - glog=0.3.5=hf484d3e_1 79 | - gmp=6.1.2=h6c8ec71_1 80 | - gmpy2=2.0.8=py36hc8893dd_2 81 | - graphite2=1.3.12=h23475e2_2 82 | - graphviz=2.40.1=h21bd128_2 83 | - gst-plugins-base=1.14.0=hbbd80ab_1 84 | - gstreamer=1.14.0=hb453b48_1 85 | - h5py=2.7.1=py36h3585f63_0 86 | - harfbuzz=1.8.8=hffaf4a1_0 87 | - hdf5=1.10.1=h9caa474_1 88 | - heapdict=1.0.0=py36_2 89 | - 
icu=58.2=h9c2bf20_1 90 | - idna=2.6=py36h82fb2a8_1 91 | - imageio=2.2.0=py36he555465_0 92 | - imagesize=0.7.1=py36h52d8127_0 93 | - intel-openmp=2018.0.0=hc7b2577_8 94 | - ipykernel=4.8.0=py36_0 95 | - ipython=6.2.1=py36h88c514a_1 96 | - ipython_genutils=0.2.0=py36hb52b0d5_0 97 | - ipywidgets=7.1.1=py36_0 98 | - isort=4.2.15=py36had401c0_0 99 | - ipywidgets=7.1.1=py36_0 [220/9077] 100 | - isort=4.2.15=py36had401c0_0 101 | - itsdangerous=0.24=py36h93cc618_1 102 | - jbig=2.1=hdba287a_0 103 | - jdcal=1.3=py36h4c697fb_0 104 | - jedi=0.11.1=py36_0 105 | - jinja2=2.10=py36ha16c418_0 106 | - joblib=0.14.0=py_0 107 | - jpeg=9b=h024ee3a_2 108 | - jsonschema=2.6.0=py36h006f8b5_0 109 | - jupyter=1.0.0=py36_4 110 | - jupyter_client=5.2.2=py36_0 111 | - jupyter_console=5.2.0=py36he59e554_1 112 | - jupyter_core=4.4.0=py36h7c827e3_0 113 | - jupyterlab=0.31.5=py36_0 114 | - jupyterlab_launcher=0.10.2=py36_0 115 | - keras-applications=1.0.6=py36_0 116 | - keras-preprocessing=1.0.5=py36_0 117 | - krb5=1.16.1=h173b8e3_7 118 | - lazy-object-proxy=1.3.1=py36h10fcdad_0 119 | - lenskit=0.7.0=py36h1aa3f02_0 120 | - libboost=1.67.0=h46d08c1_4 121 | - libcurl=7.63.0=h20c2e04_1000 122 | - libedit=3.1.20181209=hc058e9b_0 123 | - libevent=2.1.8=h1ba5d50_0 124 | - libffi=3.2.1=hd88cf55_4 125 | - libgcc-ng=9.1.0=hdf63c60_0 126 | - libgfortran-ng=7.2.0=h9f7466a_2 127 | - libopenblas=0.3.3=h5a2b251_3 128 | - libpng=1.6.37=hbc83047_0 129 | - libprotobuf=3.6.1=hd408876_0 130 | - libsodium=1.0.15=hf101ebd_0 131 | - libssh2=1.8.0=h1ba5d50_4 132 | - libstdcxx-ng=8.2.0=hdf63c60_1 133 | - libtiff=4.0.10=h2733197_2 134 | - libtool=2.4.6=h544aabb_3 135 | - libuuid=1.0.3=h1bed415_2 136 | - libxcb=1.13=h1bed415_1 137 | - libxml2=2.9.8=h26e45fe_1 138 | - libxslt=1.1.32=h1312cb7_0 139 | - llvmlite=0.28.0=py36hd408876_0 140 | - locket=0.2.0=py36h787c0ad_1 141 | - lxml=4.1.1=py36hf71bdeb_1 142 | - lz4-c=1.8.1.2=h14c3975_0 143 | - lzo=2.10=h49e0be7_2 144 | - markdown=3.0.1=py36_0 145 | - 
markupsafe=1.0=py36hd9260cd_1 146 | - mccabe=0.6.1=py36h5ad9710_1 147 | - mistune=0.8.3=py36_0 148 | - mkl=2019.4=243 149 | - mkl-service=2.3.0=py36he904b0f_0 150 | - mkl_fft=1.0.14=py36ha843d7b_0 151 | - mkl_random=1.1.0=py36hd6b4f25_0 152 | - mpc=1.0.3=hec55b23_5 153 | - mpfr=3.1.5=h11a74b3_2 154 | - mpmath=1.0.0=py36hfeacd6b_2 [165/9077] 155 | - msgpack-python=0.5.1=py36h6bb024c_0 156 | - multipledispatch=0.4.9=py36h41da3fb_0 157 | - navigator-updater=0.1.0=py36h14770f7_0 158 | - nbconvert=5.3.1=py36hb41ffb7_0 159 | - nbformat=4.4.0=py36h31c9010_0 160 | - nccl=1.3.5=cuda9.0_0 161 | - ncurses=6.1=he6710b0_1 162 | - networkx=2.1=py36_0 163 | - ninja=1.9.0=py36hfd86e86_0 164 | - nltk=3.2.5=py36h7532b22_0 165 | - nose=1.3.7=py36hcdf7029_2 166 | - notebook=5.4.0=py36_0 167 | - numba=0.43.1=py36h962f231_0 168 | - numexpr=2.6.4=py36hc4a3f9a_0 169 | - numpy-base=1.17.2=py36hde5b4d6_0 170 | - odo=0.5.1=py36h90ed295_0 171 | - olefile=0.46=py36_0 172 | - openpyxl=2.4.10=py36_0 173 | - openssl=1.1.1d=h7b6447c_3 174 | - packaging=16.8=py36ha668100_1 175 | - pandas=0.25.3=py36he6710b0_0 176 | - pandoc=1.19.2.1=hea2e7c5_1 177 | - pandocfilters=1.4.2=py36ha6701b7_1 178 | - pango=1.42.4=h049681c_0 179 | - parso=0.1.1=py36h35f843b_0 180 | - partd=0.3.8=py36h36fd896_0 181 | - patchelf=0.9=hf79760b_2 182 | - path.py=10.5=py36h55ceabb_0 183 | - pathlib2=2.3.0=py36h49efa8e_0 184 | - patsy=0.5.0=py36_0 185 | - pcre=8.42=h439df22_0 186 | - pep8=1.7.1=py36_0 187 | - pexpect=4.3.1=py36_0 188 | - pickleshare=0.7.4=py36h63277f8_0 189 | - pillow=6.1.0=py36h34e0f95_0 190 | - pixman=0.34.0=hceecf20_3 191 | - pkginfo=1.4.1=py36h215d178_1 192 | - pluggy=0.6.0=py36hb689045_0 193 | - ply=3.10=py36hed35086_0 194 | - prompt_toolkit=1.0.15=py36h17d85b1_0 195 | - psutil=5.4.3=py36h14c3975_0 196 | - ptyprocess=0.5.2=py36h69acd42_0 197 | - py=1.5.2=py36h29bf505_0 198 | - pyarrow=0.11.1=py36he6710b0_0 199 | - pycodestyle=2.3.1=py36hf609f19_0 200 | - pycosat=0.6.3=py36h0a5515d_0 201 | - 
pycparser=2.19=py36_0 202 | - pycrypto=2.6.1=py36h14c3975_7 203 | - pycurl=7.43.0.2=py36h1ba5d50_0 204 | - pyflakes=1.6.0=py36h7bd6a15_0 205 | - pygments=2.2.0=py36h0d3125c_0 206 | - pylint=1.8.2=py36_0 207 | - pyodbc=4.0.22=py36hf484d3e_0 208 | - pyopenssl=17.5.0=py36h20ba746_0 209 | - pyparsing=2.2.0=py36hee85983_1 210 | - pyqt=5.6.0=py36h0386399_5 211 | - pyparsing=2.2.0=py36hee85983_1 [110/9077] 212 | - pyqt=5.6.0=py36h0386399_5 213 | - pysocks=1.6.7=py36hd97a5b1_1 214 | - pytables=3.4.2=py36h3b5282a_2 215 | - pytest=3.3.2=py36_0 216 | - python=3.6.9=h265db76_0 217 | - python-dateutil=2.6.1=py36h88d3b88_1 218 | - python-snappy=0.5.4=py36he6710b0_0 219 | - pytorch=1.2.0=cuda100py36h938c94c_0 220 | - pytz=2017.3=py36h63b9c63_0 221 | - pywavelets=0.5.2=py36he602eb0_0 222 | - pyyaml=3.12=py36hafb9ca4_1 223 | - pyzmq=16.0.3=py36he2533c7_0 224 | - qt=5.6.3=h8bf5577_3 225 | - qtawesome=0.4.4=py36h609ed8c_0 226 | - qtconsole=4.3.1=py36h8f73b5b_0 227 | - qtpy=1.3.1=py36h3691cc8_0 228 | - readline=7.0=h7b6447c_5 229 | - requests=2.18.4=py36he2e5f8d_1 230 | - rope=0.10.7=py36h147e2ec_0 231 | - ruamel_yaml=0.15.35=py36h14c3975_1 232 | - scikit-image=0.13.1=py36h14c3975_1 233 | - scikit-learn=0.20.2=py36hd81dba3_0 234 | - scipy=1.2.0=py36h7c811a0_0 235 | - seaborn=0.8.1=py36hfad7ec4_0 236 | - send2trash=1.4.2=py36_0 237 | - setuptools=41.2.0=py36_0 238 | - simplegeneric=0.8.1=py36_2 239 | - singledispatch=3.4.0.3=py36h7a266c3_0 240 | - sip=4.18.1=py36h51ed4ed_2 241 | - six=1.12.0=py36_0 242 | - snappy=1.1.7=hbae5bb6_3 243 | - snowballstemmer=1.2.1=py36h6febd40_0 244 | - sortedcollections=0.5.3=py36h3c761f9_0 245 | - sortedcontainers=1.5.9=py36_0 246 | - sphinx=1.6.6=py36_0 247 | - sphinxcontrib=1.0=py36h6d0f590_1 248 | - sphinxcontrib-websupport=1.0.1=py36hb5cb234_1 249 | - spyder=3.2.6=py36_0 250 | - sqlalchemy=1.2.1=py36h14c3975_0 251 | - sqlite=3.29.0=h7b6447c_0 252 | - statsmodels=0.8.0=py36h8533d0b_0 253 | - sympy=1.1.1=py36hc6d1c1c_0 254 | - tblib=1.3.2=py36h34cf8b6_0 
255 | - tensorflow=1.12.0=gpu_py36he68c306_0 256 | - tensorflow-base=1.12.0=gpu_py36h8e0ae2d_0 257 | - terminado=0.8.1=py36_1 258 | - testpath=0.3.1=py36h8cadb63_0 259 | - thrift=0.11.0=py36hf484d3e_0 260 | - thrift-cpp=0.11.0=h02b749d_3 261 | - tk=8.6.8=hbc83047_0 262 | - toolz=0.9.0=py36_0 263 | - torchvision=0.4.0=cuda100py36hecfc37a_0 264 | - torchvision-cpu=0.2.1=py36_1 265 | - tornado=4.5.3=py36_0 266 | - traitlets=4.3.2=py36h674d592_0 267 | - typing=3.6.2=py36h7da032a_0 268 | - traitlets=4.3.2=py36h674d592_0 [55/9077] 269 | - typing=3.6.2=py36h7da032a_0 270 | - unicodecsv=0.14.1=py36ha668878_0 271 | - unixodbc=2.3.4=hc36303a_1 272 | - urllib3=1.22=py36hbe7ace6_0 273 | - wcwidth=0.1.7=py36hdf4376a_0 274 | - webencodings=0.5.1=py36h800622e_1 275 | - werkzeug=0.14.1=py36_0 276 | - wheel=0.33.6=py36_0 277 | - widgetsnbextension=3.1.0=py36_0 278 | - wrapt=1.10.11=py36h28b7045_0 279 | - xlrd=1.1.0=py36h1db9f0c_1 280 | - xlsxwriter=1.0.2=py36h3de1aca_0 281 | - xlwt=1.3.0=py36h7b00a1f_0 282 | - xz=5.2.4=h14c3975_4 283 | - yaml=0.1.7=had09818_2 284 | - zeromq=4.2.2=hbedb6e5_2 285 | - zict=0.1.3=py36h3a3bf81_0 286 | - zlib=1.2.11=h7b6447c_3 287 | - zstd=1.3.7=h0b5b093_0 288 | - pip: 289 | - absl-py==0.7.0 290 | - allennlp==0.7.1 291 | - aws-xray-sdk==0.95 292 | - awscli==1.16.59 293 | - bleach==1.5.0 294 | - boto3==1.7.4 295 | - botocore==1.12.49 296 | - bz2file==0.98 297 | - cffi==1.11.2 298 | - conllu==0.11 299 | - cookies==2.2.1 300 | - cymem==2.0.2 301 | - deprecation==2.0.6 302 | - dill==0.2.8.2 303 | - docker==3.5.1 304 | - docker-pycreds==0.3.0 305 | - ecdsa==0.13 306 | - editdistance==0.5.2 307 | - fasttext==0.8.22 308 | - flaky==3.4.0 309 | - flask==0.12.4 310 | - ftfy==5.5.0 311 | - future==0.17.1 312 | - gast==0.2.2 313 | - gensim==3.4.0 314 | - gevent==1.3.6 315 | - gputil==1.4.0 316 | - greenlet==0.4.15 317 | - grpcio==1.18.0 318 | - gurobipy==8.1.1 319 | - hdbscan==0.8.18 320 | - html5lib==0.9999999 321 | - jmespath==0.9.3 322 | - jsondiff==1.1.1 323 | - 
jsonnet==0.10.0 324 | - jsonpickle==1.0 325 | - jsonnet==0.10.0 326 | - jsonpickle==1.0 327 | - kiwisolver==1.0.1 328 | - matplotlib==2.2.3 329 | - mock==2.0.0 330 | - moto==1.3.4 331 | - msgpack==0.5.6 332 | - msgpack-numpy==0.4.3.2 333 | - murmurhash==1.0.1 334 | - numpy==1.15.4 335 | - numpydoc==0.8.0 336 | - overrides==1.9 337 | - parsimonious==0.8.0 338 | - pbr==5.1.1 339 | - pip==19.0.1 340 | - plac==0.9.6 341 | - preshed==2.0.1 342 | - protobuf==3.3.0 343 | - py3-ortools==6.4.4495 344 | - pyaml==18.11.0 345 | - pyasn1==0.4.4 346 | - pybind11==2.2.4 347 | - pycryptodome==3.7.0 348 | - pyenchant==2.0.0 349 | - pyhocon==0.3.50 350 | - pympler==0.5 351 | - pynvml==8.0.3 352 | - python-graphviz==0.10.1 353 | - python-jose==2.0.2 354 | - regex==2018.1.10 355 | - responses==0.10.4 356 | - rsa==3.4.2 357 | - s3transfer==0.1.13 358 | - sklearn==0.0 359 | - smart-open==1.5.7 360 | - spacy==2.0.16 361 | - sqlparse==0.2.4 362 | - tensorboard==1.7.0 363 | - tensorboardx==1.2 364 | - tensorflow-gpu==1.7.0 365 | - tensorflow-hub==0.2.0 366 | - termcolor==1.1.0 367 | - thinc==6.12.0 368 | - torch==0.4.1 369 | - torchsummary==1.5.1 370 | - tqdm==4.28.1 371 | - ujson==1.35 372 | - unidecode==1.0.22 373 | - websocket-client==0.54.0 374 | - xmltodict==0.11.0 375 | -------------------------------------------------------------------------------- /src/eval/threshold.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2019 University of Massachusetts Amherst. 3 | This file is part of "expLinkage" 4 | http://github.com/iesl/expLinkage 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
7 | You may obtain a copy of the License at 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | """ 15 | 16 | import time 17 | import numpy as np 18 | import matplotlib 19 | matplotlib.use('Agg') 20 | import matplotlib.pyplot as plt 21 | from pathlib import Path 22 | 23 | 24 | # Recursive search 25 | # TODO: Change to just work for F1. Remove f1ToEval argument 26 | def choose_threshold(trainer, infMethod, epoch="0", canopies=None): 27 | 28 | f1ToEval="f1" 29 | if infMethod != "connComp" and (not infMethod.endswith("@t")): 30 | trainer.logger.info("Can not choose threshold for infMethod = {}".format(infMethod)) 31 | return 0. 32 | 33 | printLog = True 34 | if canopies is None: 35 | if len(trainer.devCanopies) != 0: 36 | canopies = trainer.devCanopies 37 | else: 38 | canopies = trainer.trainCanopies 39 | else: 40 | pass 41 | 42 | # Precison , recall and f1 to use when finding bestTHreshold. 
Alaternatively, we could use "connComp_muc_precision" etc 43 | if f1ToEval == "muc_f1": 44 | recallToUse = "{}_muc_recall".format(infMethod) 45 | precisionToUse = "{}_muc_precision".format(infMethod) 46 | elif f1ToEval == "f1": 47 | recallToUse = "{}_recall".format(infMethod) 48 | precisionToUse = "{}_precision".format(infMethod) 49 | else: 50 | recallToUse = None 51 | precisionToUse = None 52 | raise Exception("Invalid f1ToUse={} to choose threshold".format(f1ToEval)) 53 | 54 | start = time.time() 55 | trainer.logger.info("==" * 20 + "Beginning choosing threshold for method ={}".format(infMethod)) 56 | 57 | currThreshold = 0.128 58 | precision = 0 59 | allScores = {"{}_{}".format(infMethod, metric): {} for metric in ["precision","recall", "f1", "muc_precision","muc_recall", "muc_f1"]} 60 | f1Metric = "{}_{}".format(infMethod, f1ToEval) 61 | 62 | while precision != 1: 63 | scores = trainer.evalFunc(config=trainer.config, model=trainer.model, canopies=canopies, 64 | threshDict={infMethod:currThreshold}, inferenceMethods=[infMethod], metricsForEval=f1ToEval) 65 | 66 | precision = scores[precisionToUse][0] 67 | for metric in allScores: 68 | allScores[metric][currThreshold] = scores[metric][0] if metric in scores else 0 69 | 70 | if printLog: trainer.logger.info("Precision:{}\t Threshold:{:.3f}".format(precision, currThreshold)) 71 | if trainer.config.outDisSim: # Decreasing threshold to get better precision as model outputs distance 72 | if currThreshold < 0: 73 | currThreshold *= 2 # It is a negative number and making it smaller by multiplying it by 2 74 | elif currThreshold > 0.0001: 75 | currThreshold /= 2 # It is a positive number and making it smaller by dividing it by 2 76 | elif currThreshold == 0.0: 77 | currThreshold = -0.128 # Assign a small negative value to move away from zero 78 | else:# Switch over from very small positive to very small negative to continue making threshold smaller 79 | currThreshold = 0. 
80 | 81 | else: # Increasing threshold to get better precision as model outputs similarity 82 | if currThreshold > 0: # It is already positive and make it larger by multiplying by 2 83 | currThreshold *= 2 84 | elif currThreshold < -0.0001: # If it is negative then make it larger by dividing by 2 85 | currThreshold /= 2 86 | elif currThreshold == 0.0: 87 | currThreshold = 0.128 # Assign a small positive value to move away from zero 88 | else: # Switch over from very small negative to very small positive to continue making threshold larger 89 | currThreshold = 0. 90 | 91 | 92 | 93 | bestRecall = -1 94 | theshForBestRecall = None 95 | for threshold in allScores[recallToUse]: 96 | if allScores[recallToUse][threshold] > bestRecall: 97 | bestRecall = allScores[recallToUse][threshold] 98 | theshForBestRecall = threshold 99 | 100 | if printLog: trainer.logger.info(" Best Recall:{:.3f}\t Threshold:{:.3f}".format(bestRecall, theshForBestRecall)) 101 | 102 | if bestRecall != 1: 103 | currRecall = bestRecall 104 | currThreshold = theshForBestRecall 105 | while currRecall != 1: 106 | scores = trainer.evalFunc(config=trainer.config,model=trainer.model, canopies=canopies, threshDict={infMethod:currThreshold}, 107 | inferenceMethods=[infMethod], metricsForEval=f1ToEval) 108 | for metric in allScores: 109 | allScores[metric][currThreshold] = scores[metric][0] if metric in scores else 0 110 | 111 | currRecall = allScores[recallToUse][currThreshold] 112 | if printLog: trainer.logger.info("Recall:{:.3f}\t Threshold:{:.3f}".format(currRecall, currThreshold)) 113 | 114 | if trainer.config.outDisSim: # Increasing threshold to get better recall as model outputs distance 115 | if currThreshold > 0: # If positive already, the multiply by 2 to make it larger 116 | currThreshold *= 2 117 | elif currThreshold < -0.0001: # It negative then divide by 2 to make it larger 118 | currThreshold /= 2 119 | elif currThreshold == 0: # Assign a small positive value to move away from zero 120 | 
currThreshold = 0.128 121 | else: # It too small negative, then switch over from negative to positive to continue making it larger 122 | currThreshold = 0. 123 | else: # Decrease threshold as it gives better recall 124 | if currThreshold < 0: # If negative, then making threshold smaller by n[multiplying it by 2 125 | currThreshold *= 2 126 | elif currThreshold > 0.0001: # If positive, then making threshold smaller by dividing it by 2 127 | currThreshold /= 2 128 | elif currThreshold == 0.0: # Assign a small negative value to move away from zero 129 | currThreshold = -0.128 130 | else: # It too small positive, then switch over from positive to negative to continue making it smaller 131 | currThreshold = 0. 132 | 133 | ''' Each time, I find threshold values between which the f1 score peaks. Then I try threshold values between those bounds 134 | and repeat the same procedure until: F1 at t1,t2 and (t1+t2)/2 is all same when rounded by 2 decimals or I have done this 135 | recursive search for more than 4 times 136 | ''' 137 | 138 | bestF1 = -1 139 | threshForBestF1 = None 140 | for threshold in allScores[f1Metric]: 141 | if allScores[f1Metric][threshold] > bestF1: 142 | bestF1 = allScores[f1Metric][threshold] 143 | threshForBestF1 = threshold 144 | 145 | if threshForBestF1 == 1: 146 | return threshForBestF1 147 | 148 | # Try random values between thresh for best recall and thresh for best precision 149 | if printLog: trainer.logger.info("AllScores:{}".format(allScores)) 150 | 151 | t1 = sorted(allScores[f1Metric].keys())[0] 152 | t2 = sorted(allScores[f1Metric].keys())[-1] 153 | numIntermediateThresh = 50 154 | thresholdValsToTry = np.arange(t1, t2, (t2 - t1) / numIntermediateThresh) 155 | if printLog: trainer.logger.info("Trying some additional threshold values between largest and smallest tried so far:{}".format(thresholdValsToTry)) 156 | for thresh in thresholdValsToTry: 157 | scores = trainer.evalFunc(config=trainer.config,model=trainer.model, canopies=canopies, 
threshDict={infMethod:thresh}, 158 | inferenceMethods=[infMethod], metricsForEval=f1ToEval) 159 | for metric in allScores: 160 | allScores[metric][thresh] = scores[metric][0] if metric in scores else 0 161 | 162 | numRecSearch = 0 163 | while numRecSearch <= 6: 164 | numRecSearch += 1 165 | thresholdVals = sorted(list(allScores[f1Metric].keys())) 166 | if len(thresholdVals) == 1: 167 | if printLog: trainer.logger.info("Best threshold found in just 1 attempt:{}\t{}".format(thresholdVals[0], allScores[f1Metric])) 168 | break 169 | 170 | assert len(thresholdVals) >= 2 171 | 172 | bestThreshold = thresholdVals[0] 173 | for threshTried in thresholdVals: # Choose threshold that gave best F1 on dev set 174 | if printLog: trainer.logger.info("{}\tThreshold:{:.3f}\tF1:{:.6f}".format(numRecSearch, threshTried, allScores[f1Metric][threshTried])) 175 | if allScores[f1Metric][threshTried] >= allScores[f1Metric][bestThreshold]: 176 | bestThreshold = threshTried 177 | 178 | lowerThreshold = thresholdVals[0] 179 | upperThreshold = thresholdVals[-1] 180 | 181 | prevThreshold = None 182 | for threshTried in thresholdVals: 183 | if prevThreshold == bestThreshold: 184 | upperThreshold = threshTried # Threshold immediately AFTER the threshold that gives best F1 185 | 186 | if threshTried == bestThreshold: 187 | # Threshold immediately BEFORE the threshold that gives best F1 188 | lowerThreshold = prevThreshold if prevThreshold is not None else bestThreshold 189 | 190 | prevThreshold = threshTried 191 | 192 | # Push upperThreshold to as large as possible such that it still stays immediately next to best F1 193 | thresholdVals = sorted(thresholdVals) 194 | for ctr, threshold in enumerate(thresholdVals): 195 | if threshold < upperThreshold: continue 196 | if allScores[f1Metric][upperThreshold] == allScores[f1Metric][bestThreshold]: 197 | if ctr < len(thresholdVals) - 1: 198 | upperThreshold = thresholdVals[ctr + 1] 199 | else: 200 | break 201 | 202 | # Push lowerThreshold to as large as 
possible such that it still stays immediately before best F1 203 | # thresholdVals = sorted(thresholdVals, reverse=True) 204 | # for ctr,threshold in enumerate(thresholdVals): 205 | # if threshold > lowerThreshold: continue 206 | # if allScores[f1ToUse][lowerThreshold] == allScores[f1ToUse][bestThreshold]: 207 | # if ctr < len(thresholdVals)-1: 208 | # lowerThreshold = thresholdVals[ctr+1] 209 | # else: 210 | # break 211 | 212 | if printLog: trainer.logger.info("Upper Threshold:{:.3f} Lower Threshold:{:.3f}".format(upperThreshold, lowerThreshold)) 213 | 214 | numIntermediateThresh = int(20 / numRecSearch) 215 | thresholdValsToTry = np.arange(lowerThreshold, upperThreshold, 216 | (upperThreshold - lowerThreshold) / numIntermediateThresh) 217 | if printLog: trainer.logger.info("Threshold Values to try:{}".format(["{:.3f}".format(x) for x in thresholdValsToTry])) 218 | for currThreshold in thresholdValsToTry: 219 | scores = trainer.evalFunc(config=trainer.config,model=trainer.model, canopies=canopies, threshDict={infMethod:currThreshold}, 220 | inferenceMethods=[infMethod], metricsForEval=f1ToEval) 221 | for metric in allScores: 222 | allScores[metric][currThreshold] = scores[metric][0] if metric in scores else 0 223 | if currThreshold not in allScores[f1Metric]: 224 | pass 225 | 226 | midThreshold = (lowerThreshold + upperThreshold) / 2 227 | if printLog: trainer.logger.info("Mid Threshold:{:.3f}".format(midThreshold)) 228 | if midThreshold not in allScores[f1Metric]: 229 | scores = trainer.evalFunc(config=trainer.config, model=trainer.model, canopies=canopies, threshDict={infMethod:midThreshold}, 230 | inferenceMethods=[infMethod], metricsForEval=f1ToEval) 231 | for metric in allScores: 232 | allScores[metric][midThreshold] = scores[metric][0] if metric in scores else 0 233 | 234 | 235 | if (round(allScores[f1Metric][upperThreshold], 3) == round(allScores[f1Metric][lowerThreshold], 3)) \ 236 | and (round(allScores[f1Metric][midThreshold], 3) == 
round(allScores[f1Metric][lowerThreshold], 3)): 237 | trainer.logger.info("Stopping as F1 at upperThreshold, lowerThreshold and midThreshold is same upto 3 decimal places") 238 | break 239 | 240 | # Choose bestThreshold from all the threshold values tried so far 241 | thresholdVals = sorted(list(allScores[f1Metric].keys())) 242 | bestThreshold = thresholdVals[0] 243 | for threshTried in allScores[f1Metric]: # Choose threshold that gave best F1 on dev set 244 | if allScores[f1Metric][threshTried] >= allScores[f1Metric][bestThreshold]: 245 | bestThreshold = threshTried 246 | 247 | end = time.time() 248 | threshTried = sorted(list(allScores[f1Metric].keys())) 249 | if printLog: trainer.logger.info("Tried {} threshold values. Threshold tried:{}".format(len(allScores[f1Metric]), ",".join(["{:.3f}\t{:.6f}\n".format(x, allScores[f1Metric][x]) for x in threshTried]))) 250 | trainer.logger.info("Time taken for choosing threshold={:.3f} with {} = {:.4f} is {:.3f}".format(bestThreshold, f1Metric, allScores[f1Metric][bestThreshold], end - start)) 251 | trainer.logger.info("==" * 20 + "\n") 252 | 253 | Path(trainer.config.resultDir + "/chooseThresh").mkdir(parents=True, exist_ok=True) # Create resultDir directory if not already present 254 | 255 | for metric in [f1Metric]: 256 | plt.clf() 257 | X = sorted(allScores[metric].keys()) 258 | Y = [allScores[metric][x] for x in X] 259 | plt.plot(X, Y, 'ro-') 260 | plt.plot([bestThreshold], [allScores[metric][bestThreshold]], 'b*') 261 | plt.xlabel("Threshold") 262 | plt.ylabel("{} {}".format(infMethod, metric)) 263 | plt.grid() 264 | plt.title("{} vs Threshold".format(metric)) 265 | plt.savefig(trainer.config.resultDir + "/chooseThresh/{}_{}_{}.png".format(infMethod, metric, epoch)) 266 | plt.close() 267 | 268 | return bestThreshold 269 | --------------------------------------------------------------------------------